diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 0cb3a80a6e892..cc471edc376b3 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.1.0
+Version: 2.2.0
 Title: R Frontend for Apache Spark
 Description: The SparkR package provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
@@ -41,7 +41,13 @@ Collate:
     'functions.R'
     'install.R'
     'jvm.R'
-    'mllib.R'
+    'mllib_classification.R'
+    'mllib_clustering.R'
+    'mllib_recommendation.R'
+    'mllib_regression.R'
+    'mllib_stat.R'
+    'mllib_tree.R'
+    'mllib_utils.R'
     'serialize.R'
     'sparkR.R'
     'stats.R'
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 7737ffe4ed43b..c56648a8c4fba 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2313,9 +2313,9 @@ setMethod("dropDuplicates",
 #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a
 #' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is
 #' thrown if it would be a Cartesian Product. For Cartesian join, use crossJoin instead.
-#' @param joinType The type of join to perform. The following join types are available:
-#' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left',
-#' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner".
+#' @param joinType The type of join to perform, default 'inner'.
+#' Must be one of: 'inner', 'cross', 'outer', 'full', 'full_outer',
+#' 'left', 'left_outer', 'right', 'right_outer', 'left_semi', or 'left_anti'.
 #' @return A SparkDataFrame containing the result of the join operation.
 #' @family SparkDataFrame functions
 #' @aliases join,SparkDataFrame,SparkDataFrame-method
@@ -2344,15 +2344,18 @@ setMethod("join",
               if (is.null(joinType)) {
                 sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc)
               } else {
-                if (joinType %in% c("inner", "outer", "full", "fullouter",
-                    "leftouter", "left_outer", "left",
-                    "rightouter", "right_outer", "right", "leftsemi")) {
+                if (joinType %in% c("inner", "cross",
+                    "outer", "full", "fullouter", "full_outer",
+                    "left", "leftouter", "left_outer",
+                    "right", "rightouter", "right_outer",
+                    "left_semi", "leftsemi", "left_anti", "leftanti")) {
                   joinType <- gsub("_", "", joinType)
                   sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType)
                 } else {
                   stop("joinType must be one of the following types: ",
-                    "'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left',
-                    'rightouter', 'right_outer', 'right', 'leftsemi'")
+                       "'inner', 'cross', 'outer', 'full', 'full_outer', ",
+                       "'left', 'left_outer', 'right', 'right_outer', ",
+                       "'left_semi', or 'left_anti'.")
                 }
               }
             }
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index bf5c96373c632..6ffa0f5481c65 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3150,7 +3150,8 @@ setMethod("cume_dist",
 #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
 #' sequence when there are ties. That is, if you were ranking a competition using dense_rank
 #' and had three people tie for second place, you would say that all three were in second
-#' place and that the next person came in third.
+#' place and that the next person came in third. Rank, by contrast, would assign sequential
+#' numbers, so the person who came in after the ties would register as coming in fifth.
 #'
 #' This is equivalent to the \code{DENSE_RANK} function in SQL.
 #'
@@ -3321,10 +3322,11 @@ setMethod("percent_rank",
 #'
 #' Window function: returns the rank of rows within a window partition.
 #'
-#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
-#' sequence when there are ties. That is, if you were ranking a competition using denseRank
-#' and had three people tie for second place, you would say that all three were in second
-#' place and that the next person came in third.
+#' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
+#' sequence when there are ties. That is, if you were ranking a competition using dense_rank
+#' and had three people tie for second place, you would say that all three were in second
+#' place and that the next person came in third. Rank, by contrast, would assign sequential
+#' numbers, so the person who came in after the ties would register as coming in fifth.
 #'
 #' This is equivalent to the RANK function in SQL.
 #'
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
deleted file mode 100644
index d736bbb5e9113..0000000000000
--- a/R/pkg/R/mllib.R
+++ /dev/null
@@ -1,2114 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# mllib.R: Provides methods for MLlib integration
-
-# Integration with R's standard functions.
-# Most of MLlib's argorithms are provided in two flavours:
-# - a specialization of the default R methods (glm). These methods try to respect
-#   the inputs and the outputs of R's method to the largest extent, but some small differences
-#   may exist.
-# - a set of methods that reflect the arguments of the other languages supported by Spark. These
-#   methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc.
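# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this patch): exercises the join types and the
# rank/dense_rank behavior documented in the hunks above. The data frames, column
# names, and values below are made up purely for illustration, and a SparkR
# session started with sparkR.session() is assumed.
df1 <- createDataFrame(data.frame(id = c(1, 2, 3), name = c("a", "b", "c")))
df2 <- createDataFrame(data.frame(id = c(2, 3, 4), dept = c("x", "y", "z")))

# Underscored spellings such as "left_anti" and "full_outer" are accepted after
# this change; the underscore is stripped before the call is forwarded to the JVM.
head(join(df1, df2, df1$id == df2$id, "left_anti"))   # rows of df1 with no match in df2
head(join(df1, df2, df1$id == df2$id, "full_outer"))  # all rows from both sides

# rank() vs dense_rank(): with a three-way tie for second place, rank() jumps to
# fifth for the next row, while dense_rank() continues with third.
scores <- createDataFrame(data.frame(subject = "math",
                                     name = c("p1", "p2", "p3", "p4", "p5"),
                                     score = c(10, 9, 9, 9, 8)))
ws <- orderBy(windowPartitionBy("subject"), desc(scores$score))
head(select(scores, scores$name, scores$score,
            alias(over(rank(), ws), "rank"),
            alias(over(dense_rank(), ws), "dense_rank")))
# ---------------------------------------------------------------------------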
- -#' S4 class that represents a generalized linear model -#' -#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper -#' @export -#' @note GeneralizedLinearRegressionModel since 2.0.0 -setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a NaiveBayesModel -#' -#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper -#' @export -#' @note NaiveBayesModel since 2.0.0 -setClass("NaiveBayesModel", representation(jobj = "jobj")) - -#' S4 class that represents an LDAModel -#' -#' @param jobj a Java object reference to the backing Scala LDAWrapper -#' @export -#' @note LDAModel since 2.1.0 -setClass("LDAModel", representation(jobj = "jobj")) - -#' S4 class that represents a AFTSurvivalRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper -#' @export -#' @note AFTSurvivalRegressionModel since 2.0.0 -setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a KMeansModel -#' -#' @param jobj a Java object reference to the backing Scala KMeansModel -#' @export -#' @note KMeansModel since 2.0.0 -setClass("KMeansModel", representation(jobj = "jobj")) - -#' S4 class that represents a MultilayerPerceptronClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper -#' @export -#' @note MultilayerPerceptronClassificationModel since 2.1.0 -setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj")) - -#' S4 class that represents an IsotonicRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel -#' @export -#' @note IsotonicRegressionModel since 2.1.0 -setClass("IsotonicRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a GaussianMixtureModel -#' -#' @param jobj a Java object reference to the backing Scala GaussianMixtureModel -#' @export -#' @note GaussianMixtureModel since 2.1.0 -setClass("GaussianMixtureModel", representation(jobj = "jobj")) - -#' S4 class that represents an ALSModel -#' -#' @param jobj a Java object reference to the backing Scala ALSWrapper -#' @export -#' @note ALSModel since 2.1.0 -setClass("ALSModel", representation(jobj = "jobj")) - -#' S4 class that represents an KSTest -#' -#' @param jobj a Java object reference to the backing Scala KSTestWrapper -#' @export -#' @note KSTest since 2.1.0 -setClass("KSTest", representation(jobj = "jobj")) - -#' S4 class that represents an LogisticRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel -#' @export -#' @note LogisticRegressionModel since 2.1.0 -setClass("LogisticRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a RandomForestRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala RandomForestRegressionModel -#' @export -#' @note RandomForestRegressionModel since 2.1.0 -setClass("RandomForestRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a RandomForestClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala RandomForestClassificationModel -#' @export -#' @note RandomForestClassificationModel since 2.1.0 -setClass("RandomForestClassificationModel", representation(jobj = "jobj")) - -#' S4 class that represents a GBTRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala 
GBTRegressionModel -#' @export -#' @note GBTRegressionModel since 2.1.0 -setClass("GBTRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a GBTClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala GBTClassificationModel -#' @export -#' @note GBTClassificationModel since 2.1.0 -setClass("GBTClassificationModel", representation(jobj = "jobj")) - -#' Saves the MLlib model to the input path -#' -#' Saves the MLlib model to the input path. For more information, see the specific -#' MLlib model below. -#' @rdname write.ml -#' @name write.ml -#' @export -#' @seealso \link{spark.glm}, \link{glm}, -#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, -#' @seealso \link{spark.kmeans}, -#' @seealso \link{spark.lda}, \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, -#' @seealso \link{spark.randomForest}, \link{spark.survreg}, -#' @seealso \link{read.ml} -NULL - -#' Makes predictions from a MLlib model -#' -#' Makes predictions from a MLlib model. For more information, see the specific -#' MLlib model below. -#' @rdname predict -#' @name predict -#' @export -#' @seealso \link{spark.glm}, \link{glm}, -#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, -#' @seealso \link{spark.kmeans}, -#' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, -#' @seealso \link{spark.randomForest}, \link{spark.survreg} -NULL - -write_internal <- function(object, path, overwrite = FALSE) { - writer <- callJMethod(object@jobj, "write") - if (overwrite) { - writer <- callJMethod(writer, "overwrite") - } - invisible(callJMethod(writer, "save", path)) -} - -predict_internal <- function(object, newData) { - dataFrame(callJMethod(object@jobj, "transform", newData@sdf)) -} - -#' Generalized Linear Models -#' -#' Fits generalized linear model against a Spark DataFrame. -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param family a description of the error distribution and link function to be used in the model. -#' This can be a character string naming a family function, a family function or -#' the result of a call to a family function. Refer R family at -#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. -#' @param tol positive convergence tolerance of iterations. -#' @param maxIter integer giving the maximal number of IRLS iterations. -#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance -#' weights as 1.0. -#' @param regParam regularization parameter for L2 regularization. -#' @param ... additional arguments passed to the method. -#' @aliases spark.glm,SparkDataFrame,formula-method -#' @return \code{spark.glm} returns a fitted generalized linear model. 
-#' @rdname spark.glm -#' @name spark.glm -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") -#' summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.glm since 2.0.0 -#' @seealso \link{glm}, \link{read.ml} -setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0) { - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } - if (is.function(family)) { - family <- family() - } - if (is.null(family$family)) { - print(family) - stop("'family' not recognized") - } - - formula <- paste(deparse(formula), collapse = "") - if (is.null(weightCol)) { - weightCol <- "" - } - - jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", - "fit", formula, data@sdf, family$family, family$link, - tol, as.integer(maxIter), as.character(weightCol), regParam) - new("GeneralizedLinearRegressionModel", jobj = jobj) - }) - -#' Generalized Linear Models (R-compliant) -#' -#' Fits a generalized linear model, similarly to R's glm(). -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param data a SparkDataFrame or R's glm data for training. -#' @param family a description of the error distribution and link function to be used in the model. -#' This can be a character string naming a family function, a family function or -#' the result of a call to a family function. Refer R family at -#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. -#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance -#' weights as 1.0. -#' @param epsilon positive convergence tolerance of iterations. -#' @param maxit integer giving the maximal number of IRLS iterations. -#' @return \code{glm} returns a fitted generalized linear model. -#' @rdname glm -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian") -#' summary(model) -#' } -#' @note glm since 1.5.0 -#' @seealso \link{spark.glm} -setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), - function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) { - spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol) - }) - -# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). - -#' @param object a fitted generalized linear model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. 
-#' The list of components includes at least the \code{coefficients} (coefficients matrix, which includes -#' coefficients, standard error of coefficients, t value and p value), -#' \code{null.deviance} (null/residual degrees of freedom), \code{aic} (AIC) -#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in the data, -#' the coefficients matrix only provides coefficients. -#' @rdname spark.glm -#' @export -#' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 -setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), - function(object) { - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - features <- callJMethod(jobj, "rFeatures") - coefficients <- callJMethod(jobj, "rCoefficients") - dispersion <- callJMethod(jobj, "rDispersion") - null.deviance <- callJMethod(jobj, "rNullDeviance") - deviance <- callJMethod(jobj, "rDeviance") - df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull") - df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom") - aic <- callJMethod(jobj, "rAic") - iter <- callJMethod(jobj, "rNumIterations") - family <- callJMethod(jobj, "rFamily") - deviance.resid <- if (is.loaded) { - NULL - } else { - dataFrame(callJMethod(jobj, "rDevianceResiduals")) - } - # If the underlying WeightedLeastSquares using "normal" solver, we can provide - # coefficients, standard error of coefficients, t value and p value. Otherwise, - # it will be fitted by local "l-bfgs", we can only provide coefficients. - if (length(features) == length(coefficients)) { - coefficients <- matrix(coefficients, ncol = 1) - colnames(coefficients) <- c("Estimate") - rownames(coefficients) <- unlist(features) - } else { - coefficients <- matrix(coefficients, ncol = 4) - colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") - rownames(coefficients) <- unlist(features) - } - ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, - dispersion = dispersion, null.deviance = null.deviance, - deviance = deviance, df.null = df.null, df.residual = df.residual, - aic = aic, iter = iter, family = family, is.loaded = is.loaded) - class(ans) <- "summary.GeneralizedLinearRegressionModel" - ans - }) - -# Prints the summary of GeneralizedLinearRegressionModel - -#' @rdname spark.glm -#' @param x summary object of fitted generalized linear model returned by \code{summary} function. -#' @export -#' @note print.summary.GeneralizedLinearRegressionModel since 2.0.0 -print.summary.GeneralizedLinearRegressionModel <- function(x, ...) 
{ - if (x$is.loaded) { - cat("\nSaved-loaded model does not support output 'Deviance Residuals'.\n") - } else { - x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals", - c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max")) - x$deviance.resid <- zapsmall(x$deviance.resid, 5L) - cat("\nDeviance Residuals: \n") - cat("(Note: These are approximate quantiles with relative error <= 0.01)\n") - print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L) - } - - cat("\nCoefficients:\n") - print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L) - - cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion), - ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"), - format(unlist(x[c("null.deviance", "deviance")]), digits = 5L), - " on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"), - 1L, paste, collapse = " "), sep = "") - cat("AIC: ", format(x$aic, digits = 4L), "\n\n", - "Number of Fisher Scoring iterations: ", x$iter, "\n\n", sep = "") - invisible(x) - } - -# Makes predictions from a generalized linear model produced by glm() or spark.glm(), -# similarly to R's predict(). - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction". -#' @rdname spark.glm -#' @export -#' @note predict(GeneralizedLinearRegressionModel) since 1.5.0 -setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), -# similarly to R package e1071's predict. - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named -#' "prediction". -#' @rdname spark.naiveBayes -#' @export -#' @note predict(NaiveBayesModel) since 2.0.0 -setMethod("predict", signature(object = "NaiveBayesModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} - -#' @param object a naive Bayes model fitted by \code{spark.naiveBayes}. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{apriori} (the label distribution) and -#' \code{tables} (conditional probabilities given the target label). -#' @rdname spark.naiveBayes -#' @export -#' @note summary(NaiveBayesModel) since 2.0.0 -setMethod("summary", signature(object = "NaiveBayesModel"), - function(object) { - jobj <- object@jobj - features <- callJMethod(jobj, "features") - labels <- callJMethod(jobj, "labels") - apriori <- callJMethod(jobj, "apriori") - apriori <- t(as.matrix(unlist(apriori))) - colnames(apriori) <- unlist(labels) - tables <- callJMethod(jobj, "tables") - tables <- matrix(tables, nrow = length(labels)) - rownames(tables) <- unlist(labels) - colnames(tables) <- unlist(features) - list(apriori = apriori, tables = tables) - }) - -# Returns posterior probabilities from a Latent Dirichlet Allocation model produced by spark.lda() - -#' @param newData A SparkDataFrame for testing. -#' @return \code{spark.posterior} returns a SparkDataFrame containing posterior probabilities -#' vectors named "topicDistribution". 
-#' @rdname spark.lda -#' @aliases spark.posterior,LDAModel,SparkDataFrame-method -#' @export -#' @note spark.posterior(LDAModel) since 2.1.0 -setMethod("spark.posterior", signature(object = "LDAModel", newData = "SparkDataFrame"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Returns the summary of a Latent Dirichlet Allocation model produced by \code{spark.lda} - -#' @param object A Latent Dirichlet Allocation model fitted by \code{spark.lda}. -#' @param maxTermsPerTopic Maximum number of terms to collect for each topic. Default value of 10. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes -#' \item{\code{docConcentration}}{concentration parameter commonly named \code{alpha} for -#' the prior placed on documents distributions over topics \code{theta}} -#' \item{\code{topicConcentration}}{concentration parameter commonly named \code{beta} or -#' \code{eta} for the prior placed on topic distributions over terms} -#' \item{\code{logLikelihood}}{log likelihood of the entire corpus} -#' \item{\code{logPerplexity}}{log perplexity} -#' \item{\code{isDistributed}}{TRUE for distributed model while FALSE for local model} -#' \item{\code{vocabSize}}{number of terms in the corpus} -#' \item{\code{topics}}{top 10 terms and their weights of all topics} -#' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file -#' used as training set} -#' @rdname spark.lda -#' @aliases summary,LDAModel-method -#' @export -#' @note summary(LDAModel) since 2.1.0 -setMethod("summary", signature(object = "LDAModel"), - function(object, maxTermsPerTopic) { - maxTermsPerTopic <- as.integer(ifelse(missing(maxTermsPerTopic), 10, maxTermsPerTopic)) - jobj <- object@jobj - docConcentration <- callJMethod(jobj, "docConcentration") - topicConcentration <- callJMethod(jobj, "topicConcentration") - logLikelihood <- callJMethod(jobj, "logLikelihood") - logPerplexity <- callJMethod(jobj, "logPerplexity") - isDistributed <- callJMethod(jobj, "isDistributed") - vocabSize <- callJMethod(jobj, "vocabSize") - topics <- dataFrame(callJMethod(jobj, "topics", maxTermsPerTopic)) - vocabulary <- callJMethod(jobj, "vocabulary") - list(docConcentration = unlist(docConcentration), - topicConcentration = topicConcentration, - logLikelihood = logLikelihood, logPerplexity = logPerplexity, - isDistributed = isDistributed, vocabSize = vocabSize, - topics = topics, vocabulary = unlist(vocabulary)) - }) - -# Returns the log perplexity of a Latent Dirichlet Allocation model produced by \code{spark.lda} - -#' @return \code{spark.perplexity} returns the log perplexity of given SparkDataFrame, or the log -#' perplexity of the training data if missing argument "data". -#' @rdname spark.lda -#' @aliases spark.perplexity,LDAModel-method -#' @export -#' @note spark.perplexity(LDAModel) since 2.1.0 -setMethod("spark.perplexity", signature(object = "LDAModel", data = "SparkDataFrame"), - function(object, data) { - ifelse(missing(data), callJMethod(object@jobj, "logPerplexity"), - callJMethod(object@jobj, "computeLogPerplexity", data@sdf)) - }) - -# Saves the Latent Dirichlet Allocation model to the input path. - -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. 
-#' -#' @rdname spark.lda -#' @aliases write.ml,LDAModel,character-method -#' @export -#' @seealso \link{read.ml} -#' @note write.ml(LDAModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "LDAModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' Isotonic Regression Model -#' -#' Fits an Isotonic Regression model against a Spark DataFrame, similarly to R's isoreg(). -#' Users can print, make predictions on the produced model and save the model to the input path. -#' -#' @param data SparkDataFrame for training. -#' @param formula A symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param isotonic Whether the output sequence should be isotonic/increasing (TRUE) or -#' antitonic/decreasing (FALSE). -#' @param featureIndex The index of the feature if \code{featuresCol} is a vector column -#' (default: 0), no effect otherwise. -#' @param weightCol The weight column name. -#' @param ... additional arguments passed to the method. -#' @return \code{spark.isoreg} returns a fitted Isotonic Regression model. -#' @rdname spark.isoreg -#' @aliases spark.isoreg,SparkDataFrame,formula-method -#' @name spark.isoreg -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data <- list(list(7.0, 0.0), list(5.0, 1.0), list(3.0, 2.0), -#' list(5.0, 3.0), list(1.0, 4.0)) -#' df <- createDataFrame(data, c("label", "feature")) -#' model <- spark.isoreg(df, label ~ feature, isotonic = FALSE) -#' # return model boundaries and prediction as lists -#' result <- summary(model, df) -#' # prediction based on fitted model -#' predict_data <- list(list(-2.0), list(-1.0), list(0.5), -#' list(0.75), list(1.0), list(2.0), list(9.0)) -#' predict_df <- createDataFrame(predict_data, c("feature")) -#' # get prediction column -#' predict_result <- collect(select(predict(model, predict_df), "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.isoreg since 2.1.0 -setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) { - formula <- paste(deparse(formula), collapse = "") - - if (is.null(weightCol)) { - weightCol <- "" - } - - jobj <- callJStatic("org.apache.spark.ml.r.IsotonicRegressionWrapper", "fit", - data@sdf, formula, as.logical(isotonic), as.integer(featureIndex), - as.character(weightCol)) - new("IsotonicRegressionModel", jobj = jobj) - }) - -# Predicted values based on an isotonicRegression model - -#' @param object a fitted IsotonicRegressionModel. -#' @param newData SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted values. -#' @rdname spark.isoreg -#' @aliases predict,IsotonicRegressionModel,SparkDataFrame-method -#' @export -#' @note predict(IsotonicRegressionModel) since 2.1.0 -setMethod("predict", signature(object = "IsotonicRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Get the summary of an IsotonicRegressionModel model - -#' @return \code{summary} returns summary information of the fitted model, which is a list. 
-#' The list includes model's \code{boundaries} (boundaries in increasing order) -#' and \code{predictions} (predictions associated with the boundaries at the same index). -#' @rdname spark.isoreg -#' @aliases summary,IsotonicRegressionModel-method -#' @export -#' @note summary(IsotonicRegressionModel) since 2.1.0 -setMethod("summary", signature(object = "IsotonicRegressionModel"), - function(object) { - jobj <- object@jobj - boundaries <- callJMethod(jobj, "boundaries") - predictions <- callJMethod(jobj, "predictions") - list(boundaries = boundaries, predictions = predictions) - }) - -#' K-Means Clustering Model -#' -#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' Note that the response variable of formula is empty in spark.kmeans. -#' @param k number of centers. -#' @param maxIter maximum iteration number. -#' @param initMode the initialization algorithm choosen to fit the model. -#' @param ... additional argument(s) passed to the method. -#' @return \code{spark.kmeans} returns a fitted k-means model. -#' @rdname spark.kmeans -#' @aliases spark.kmeans,SparkDataFrame,formula-method -#' @name spark.kmeans -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random") -#' summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.kmeans since 2.0.0 -#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} -setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) { - formula <- paste(deparse(formula), collapse = "") - initMode <- match.arg(initMode) - jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula, - as.integer(k), as.integer(maxIter), initMode) - new("KMeansModel", jobj = jobj) - }) - -#' Get fitted result from a k-means model -#' -#' Get fitted result from a k-means model, similarly to R's fitted(). -#' Note: A saved-loaded model does not support this method. -#' -#' @param object a fitted k-means model. -#' @param method type of fitted results, \code{"centers"} for cluster centers -#' or \code{"classes"} for assigned classes. -#' @param ... additional argument(s) passed to the method. -#' @return \code{fitted} returns a SparkDataFrame containing fitted values. 
-#' @rdname fitted -#' @export -#' @examples -#' \dontrun{ -#' model <- spark.kmeans(trainingData, ~ ., 2) -#' fitted.model <- fitted(model) -#' showDF(fitted.model) -#'} -#' @note fitted since 2.0.0 -setMethod("fitted", signature(object = "KMeansModel"), - function(object, method = c("centers", "classes")) { - method <- match.arg(method) - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - if (is.loaded) { - stop("Saved-loaded k-means model does not support 'fitted' method") - } else { - dataFrame(callJMethod(jobj, "fitted", method)) - } - }) - -# Get the summary of a k-means model - -#' @param object a fitted k-means model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes the model's \code{k} (number of cluster centers), -#' \code{coefficients} (model cluster centers), -#' \code{size} (number of data points in each cluster), and \code{cluster} -#' (cluster centers of the transformed data). -#' @rdname spark.kmeans -#' @export -#' @note summary(KMeansModel) since 2.0.0 -setMethod("summary", signature(object = "KMeansModel"), - function(object) { - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - features <- callJMethod(jobj, "features") - coefficients <- callJMethod(jobj, "coefficients") - k <- callJMethod(jobj, "k") - size <- callJMethod(jobj, "size") - coefficients <- t(matrix(coefficients, ncol = k)) - colnames(coefficients) <- unlist(features) - rownames(coefficients) <- 1:k - cluster <- if (is.loaded) { - NULL - } else { - dataFrame(callJMethod(jobj, "cluster")) - } - list(k = k, coefficients = coefficients, size = size, - cluster = cluster, is.loaded = is.loaded) - }) - -# Predicted values based on a k-means model - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns the predicted values based on a k-means model. -#' @rdname spark.kmeans -#' @export -#' @note predict(KMeansModel) since 2.0.0 -setMethod("predict", signature(object = "KMeansModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' Logistic Regression Model -#' -#' Fits an logistic regression model against a Spark DataFrame. It supports "binomial": Binary logistic regression -#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. -#' Users can print, make predictions on the produced model and save the model to the input path. -#' -#' @param data SparkDataFrame for training. -#' @param formula A symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param regParam the regularization parameter. -#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. -#' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination -#' of L1 and L2. Default is 0.0 which is an L2 penalty. -#' @param maxIter maximum iteration number. -#' @param tol convergence tolerance of iterations. -#' @param family the name of family which is a description of the label distribution to be used in the model. -#' Supported options: -#' \itemize{ -#' \item{"auto": Automatically select the family based on the number of classes: -#' If number of classes == 1 || number of classes == 2, set to "binomial". 
-#' Else, set to "multinomial".} -#' \item{"binomial": Binary logistic regression with pivoting.} -#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.} -#' } -#' @param standardization whether to standardize the training features before fitting the model. The coefficients -#' of models will be always returned on the original scale, so it will be transparent for -#' users. Note that with/without standardization, the models should be always converged -#' to the same solution when no regularization is applied. Default is TRUE, same as glmnet. -#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 -#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 -#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with -#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification to adjust the probability of -#' predicting each class. Array must have length equal to the number of classes, with values > 0, -#' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p -#' is the original probability of that class and t is the class's threshold. -#' @param weightCol The weight column name. -#' @param ... additional arguments passed to the method. -#' @return \code{spark.logit} returns a fitted logistic regression model. -#' @rdname spark.logit -#' @aliases spark.logit,SparkDataFrame,formula-method -#' @name spark.logit -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' # binary logistic regression -#' df <- createDataFrame(iris) -#' training <- df[df$Species %in% c("versicolor", "virginica"), ] -#' model <- spark.logit(training, Species ~ ., regParam = 0.5) -#' summary <- summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, training) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and predict -#' # Note that summary deos not work on loaded model -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' -#' # multinomial logistic regression -#' -#' df <- createDataFrame(iris) -#' model <- spark.logit(df, Species ~ ., regParam = 0.5) -#' summary <- summary(model) -#' -#' } -#' @note spark.logit since 2.1.0 -setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, - tol = 1E-6, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL) { - formula <- paste(deparse(formula), collapse = "") - - if (is.null(weightCol)) { - weightCol <- "" - } - - jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", - data@sdf, formula, as.numeric(regParam), - as.numeric(elasticNetParam), as.integer(maxIter), - as.numeric(tol), as.character(family), - as.logical(standardization), as.array(thresholds), - as.character(weightCol)) - new("LogisticRegressionModel", jobj = jobj) - }) - -# Predicted values based on an LogisticRegressionModel model - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns the predicted values based on an LogisticRegressionModel. 
-#' @rdname spark.logit -#' @aliases predict,LogisticRegressionModel,SparkDataFrame-method -#' @export -#' @note predict(LogisticRegressionModel) since 2.1.0 -setMethod("predict", signature(object = "LogisticRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Get the summary of an LogisticRegressionModel - -#' @param object an LogisticRegressionModel fitted by \code{spark.logit}. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{coefficients} (coefficients matrix of the fitted model). -#' @rdname spark.logit -#' @aliases summary,LogisticRegressionModel-method -#' @export -#' @note summary(LogisticRegressionModel) since 2.1.0 -setMethod("summary", signature(object = "LogisticRegressionModel"), - function(object) { - jobj <- object@jobj - features <- callJMethod(jobj, "rFeatures") - labels <- callJMethod(jobj, "labels") - coefficients <- callJMethod(jobj, "rCoefficients") - nCol <- length(coefficients) / length(features) - coefficients <- matrix(coefficients, ncol = nCol) - # If nCol == 1, means this is a binomial logistic regression model with pivoting. - # Otherwise, it's a multinomial logistic regression model without pivoting. - if (nCol == 1) { - colnames(coefficients) <- c("Estimate") - } else { - colnames(coefficients) <- unlist(labels) - } - rownames(coefficients) <- unlist(features) - - list(coefficients = coefficients) - }) - -#' Multilayer Perceptron Classification Model -#' -#' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame. -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' Only categorical data is supported. -#' For more details, see -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{ -#' Multilayer Perceptron} -#' -#' @param data a \code{SparkDataFrame} of observations and labels for model fitting. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param blockSize blockSize parameter. -#' @param layers integer vector containing the number of nodes for each layer. -#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs". -#' @param maxIter maximum iteration number. -#' @param tol convergence tolerance of iterations. -#' @param stepSize stepSize parameter. -#' @param seed seed parameter for weights initialization. -#' @param initialWeights initialWeights parameter for weights initialization, it should be a -#' numeric vector. -#' @param ... additional arguments passed to the method. -#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. 
-#' @rdname spark.mlp -#' @aliases spark.mlp,SparkDataFrame,formula-method -#' @name spark.mlp -#' @seealso \link{read.ml} -#' @export -#' @examples -#' \dontrun{ -#' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") -#' -#' # fit a Multilayer Perceptron Classification Model -#' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs", -#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, -#' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) -#' -#' # get the summary of the model -#' summary(model) -#' -#' # make predictions -#' predictions <- predict(model, df) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.mlp since 2.1.0 -setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, - tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) { - formula <- paste(deparse(formula), collapse = "") - if (is.null(layers)) { - stop ("layers must be a integer vector with length > 1.") - } - layers <- as.integer(na.omit(layers)) - if (length(layers) <= 1) { - stop ("layers must be a integer vector with length > 1.") - } - if (!is.null(seed)) { - seed <- as.character(as.integer(seed)) - } - if (!is.null(initialWeights)) { - initialWeights <- as.array(as.numeric(na.omit(initialWeights))) - } - jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", - "fit", data@sdf, formula, as.integer(blockSize), as.array(layers), - as.character(solver), as.integer(maxIter), as.numeric(tol), - as.numeric(stepSize), seed, initialWeights) - new("MultilayerPerceptronClassificationModel", jobj = jobj) - }) - -# Makes predictions from a model produced by spark.mlp(). - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named -#' "prediction". -#' @rdname spark.mlp -#' @aliases predict,MultilayerPerceptronClassificationModel-method -#' @export -#' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0 -setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp} - -#' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp} -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs} -#' (number of outputs), \code{layers} (array of layer sizes including input -#' and output layers), and \code{weights} (the weights of layers). -#' For \code{weights}, it is a numeric vector with length equal to the expected -#' given the architecture (i.e., for 8-10-2 network, 112 connection weights). 
-#' @rdname spark.mlp -#' @export -#' @aliases summary,MultilayerPerceptronClassificationModel-method -#' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0 -setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"), - function(object) { - jobj <- object@jobj - layers <- unlist(callJMethod(jobj, "layers")) - numOfInputs <- head(layers, n = 1) - numOfOutputs <- tail(layers, n = 1) - weights <- callJMethod(jobj, "weights") - list(numOfInputs = numOfInputs, numOfOutputs = numOfOutputs, - layers = layers, weights = weights) - }) - -#' Naive Bayes Models -#' -#' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame. -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' Only categorical data is supported. -#' -#' @param data a \code{SparkDataFrame} of observations and labels for model fitting. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param smoothing smoothing parameter. -#' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}. -#' @return \code{spark.naiveBayes} returns a fitted naive Bayes model. -#' @rdname spark.naiveBayes -#' @aliases spark.naiveBayes,SparkDataFrame,formula-method -#' @name spark.naiveBayes -#' @seealso e1071: \url{https://cran.r-project.org/package=e1071} -#' @export -#' @examples -#' \dontrun{ -#' data <- as.data.frame(UCBAdmissions) -#' df <- createDataFrame(data) -#' -#' # fit a Bernoulli naive Bayes model -#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0) -#' -#' # get the summary of the model -#' summary(model) -#' -#' # make predictions -#' predictions <- predict(model, df) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.naiveBayes since 2.0.0 -setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, smoothing = 1.0) { - formula <- paste(deparse(formula), collapse = "") - jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit", - formula, data@sdf, smoothing) - new("NaiveBayesModel", jobj = jobj) - }) - -# Saves the Bernoulli naive Bayes model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.naiveBayes -#' @export -#' @seealso \link{write.ml} -#' @note write.ml(NaiveBayesModel, character) since 2.0.0 -setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Saves the AFT survival regression model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. 
-#' @rdname spark.survreg -#' @export -#' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0 -#' @seealso \link{write.ml} -setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Saves the generalized linear model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.glm -#' @export -#' @note write.ml(GeneralizedLinearRegressionModel, character) since 2.0.0 -setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted MLlib model to the input path - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.kmeans -#' @export -#' @note write.ml(KMeansModel, character) since 2.0.0 -setMethod("write.ml", signature(object = "KMeansModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Saves the Multilayer Perceptron Classification Model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.mlp -#' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method -#' @export -#' @seealso \link{write.ml} -#' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationModel", - path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted IsotonicRegressionModel to the input path - -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.isoreg -#' @aliases write.ml,IsotonicRegressionModel,character-method -#' @export -#' @note write.ml(IsotonicRegression, character) since 2.1.0 -setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted LogisticRegressionModel to the input path - -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.logit -#' @aliases write.ml,LogisticRegressionModel,character-method -#' @export -#' @note write.ml(LogisticRegression, character) since 2.1.0 -setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted MLlib model to the input path - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. 
Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @aliases write.ml,GaussianMixtureModel,character-method -#' @rdname spark.gaussianMixture -#' @export -#' @note write.ml(GaussianMixtureModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' Load a fitted MLlib model from the input path. -#' -#' @param path path of the model to read. -#' @return A fitted MLlib model. -#' @rdname read.ml -#' @name read.ml -#' @export -#' @seealso \link{write.ml} -#' @examples -#' \dontrun{ -#' path <- "path/to/model" -#' model <- read.ml(path) -#' } -#' @note read.ml since 2.0.0 -read.ml <- function(path) { - path <- suppressWarnings(normalizePath(path)) - jobj <- callJStatic("org.apache.spark.ml.r.RWrappers", "load", path) - if (isInstanceOf(jobj, "org.apache.spark.ml.r.NaiveBayesWrapper")) { - new("NaiveBayesModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper")) { - new("AFTSurvivalRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper")) { - new("GeneralizedLinearRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.KMeansWrapper")) { - new("KMeansModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LDAWrapper")) { - new("LDAModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper")) { - new("MultilayerPerceptronClassificationModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.IsotonicRegressionWrapper")) { - new("IsotonicRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GaussianMixtureWrapper")) { - new("GaussianMixtureModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) { - new("ALSModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LogisticRegressionWrapper")) { - new("LogisticRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestRegressorWrapper")) { - new("RandomForestRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestClassifierWrapper")) { - new("RandomForestClassificationModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTRegressorWrapper")) { - new("GBTRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTClassifierWrapper")) { - new("GBTClassificationModel", jobj = jobj) - } else { - stop("Unsupported model: ", jobj) - } -} - -#' Accelerated Failure Time (AFT) Survival Regression Model -#' -#' \code{spark.survreg} fits an accelerated failure time (AFT) survival regression model on -#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted AFT model, -#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to -#' save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', ':', '+', and '-'. -#' Note that operator '.' is not supported currently. -#' @return \code{spark.survreg} returns a fitted AFT survival regression model. 
-#' @rdname spark.survreg -#' @seealso survival: \url{https://cran.r-project.org/package=survival} -#' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(ovarian) -#' model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx) -#' -#' # get a summary of the model -#' summary(model) -#' -#' # make predictions -#' predicted <- predict(model, df) -#' showDF(predicted) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.survreg since 2.0.0 -setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula) { - formula <- paste(deparse(formula), collapse = "") - jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper", - "fit", formula, data@sdf) - new("AFTSurvivalRegressionModel", jobj = jobj) - }) - -#' Latent Dirichlet Allocation -#' -#' \code{spark.lda} fits a Latent Dirichlet Allocation model on a SparkDataFrame. Users can call -#' \code{summary} to get a summary of the fitted LDA model, \code{spark.posterior} to compute -#' posterior probabilities on new data, \code{spark.perplexity} to compute log perplexity on new -#' data and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' @param data A SparkDataFrame for training. -#' @param features Features column name. Either libSVM-format column or character-format column is -#' valid. -#' @param k Number of topics. -#' @param maxIter Maximum iterations. -#' @param optimizer Optimizer to train an LDA model, "online" or "em", default is "online". -#' @param subsamplingRate (For online optimizer) Fraction of the corpus to be sampled and used in -#' each iteration of mini-batch gradient descent, in range (0, 1]. -#' @param topicConcentration concentration parameter (commonly named \code{beta} or \code{eta}) for -#' the prior placed on topic distributions over terms, default -1 to set automatically on the -#' Spark side. Use \code{summary} to retrieve the effective topicConcentration. Only 1-size -#' numeric is accepted. -#' @param docConcentration concentration parameter (commonly named \code{alpha}) for the -#' prior placed on documents distributions over topics (\code{theta}), default -1 to set -#' automatically on the Spark side. Use \code{summary} to retrieve the effective -#' docConcentration. Only 1-size or \code{k}-size numeric is accepted. -#' @param customizedStopWords stopwords that need to be removed from the given corpus. Ignore the -#' parameter if libSVM-format column is used as the features column. -#' @param maxVocabSize maximum vocabulary size, default 1 << 18 -#' @param ... additional argument(s) passed to the method. -#' @return \code{spark.lda} returns a fitted Latent Dirichlet Allocation model. 
-#' @rdname spark.lda -#' @aliases spark.lda,SparkDataFrame-method -#' @seealso topicmodels: \url{https://cran.r-project.org/package=topicmodels} -#' @export -#' @examples -#' \dontrun{ -#' # nolint start -#' # An example "path/to/file" can be -#' # paste0(Sys.getenv("SPARK_HOME"), "/data/mllib/sample_lda_libsvm_data.txt") -#' # nolint end -#' text <- read.df("path/to/file", source = "libsvm") -#' model <- spark.lda(data = text, optimizer = "em") -#' -#' # get a summary of the model -#' summary(model) -#' -#' # compute posterior probabilities -#' posterior <- spark.posterior(model, text) -#' showDF(posterior) -#' -#' # compute perplexity -#' perplexity <- spark.perplexity(model, text) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.lda since 2.1.0 -setMethod("spark.lda", signature(data = "SparkDataFrame"), - function(data, features = "features", k = 10, maxIter = 20, optimizer = c("online", "em"), - subsamplingRate = 0.05, topicConcentration = -1, docConcentration = -1, - customizedStopWords = "", maxVocabSize = bitwShiftL(1, 18)) { - optimizer <- match.arg(optimizer) - jobj <- callJStatic("org.apache.spark.ml.r.LDAWrapper", "fit", data@sdf, features, - as.integer(k), as.integer(maxIter), optimizer, - as.numeric(subsamplingRate), topicConcentration, - as.array(docConcentration), as.array(customizedStopWords), - maxVocabSize) - new("LDAModel", jobj = jobj) - }) - -# Returns a summary of the AFT survival regression model produced by spark.survreg, -# similarly to R's summary(). - -#' @param object a fitted AFT survival regression model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes the model's \code{coefficients} (features, coefficients, -#' intercept and log(scale)). -#' @rdname spark.survreg -#' @export -#' @note summary(AFTSurvivalRegressionModel) since 2.0.0 -setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), - function(object) { - jobj <- object@jobj - features <- callJMethod(jobj, "rFeatures") - coefficients <- callJMethod(jobj, "rCoefficients") - coefficients <- as.matrix(unlist(coefficients)) - colnames(coefficients) <- c("Value") - rownames(coefficients) <- unlist(features) - list(coefficients = coefficients) - }) - -# Makes predictions from an AFT survival regression model or a model produced by -# spark.survreg, similarly to R package survival's predict. - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted values -#' on the original scale of the data (mean predicted value at scale = 1.0). -#' @rdname spark.survreg -#' @export -#' @note predict(AFTSurvivalRegressionModel) since 2.0.0 -setMethod("predict", signature(object = "AFTSurvivalRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' Multivariate Gaussian Mixture Model (GMM) -#' -#' Fits multivariate gaussian mixture model against a Spark DataFrame, similarly to R's -#' mvnormalmixEM(). Users can call \code{summary} to print a summary of the fitted model, -#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} -#' to save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. 
-#' Note that the response variable of formula is empty in spark.gaussianMixture. -#' @param k number of independent Gaussians in the mixture model. -#' @param maxIter maximum iteration number. -#' @param tol the convergence tolerance. -#' @param ... additional arguments passed to the method. -#' @aliases spark.gaussianMixture,SparkDataFrame,formula-method -#' @return \code{spark.gaussianMixture} returns a fitted multivariate gaussian mixture model. -#' @rdname spark.gaussianMixture -#' @name spark.gaussianMixture -#' @seealso mixtools: \url{https://cran.r-project.org/package=mixtools} -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' library(mvtnorm) -#' set.seed(100) -#' a <- rmvnorm(4, c(0, 0)) -#' b <- rmvnorm(6, c(3, 4)) -#' data <- rbind(a, b) -#' df <- createDataFrame(as.data.frame(data)) -#' model <- spark.gaussianMixture(df, ~ V1 + V2, k = 2) -#' summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, df) -#' head(select(fitted, "V1", "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.gaussianMixture since 2.1.0 -#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} -setMethod("spark.gaussianMixture", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, k = 2, maxIter = 100, tol = 0.01) { - formula <- paste(deparse(formula), collapse = "") - jobj <- callJStatic("org.apache.spark.ml.r.GaussianMixtureWrapper", "fit", data@sdf, - formula, as.integer(k), as.integer(maxIter), as.numeric(tol)) - new("GaussianMixtureModel", jobj = jobj) - }) - -# Get the summary of a multivariate gaussian mixture model - -#' @param object a fitted gaussian mixture model. -#' @return \code{summary} returns summary of the fitted model, which is a list. -#' The list includes the model's \code{lambda} (lambda), \code{mu} (mu), -#' \code{sigma} (sigma), and \code{posterior} (posterior). -#' @aliases spark.gaussianMixture,SparkDataFrame,formula-method -#' @rdname spark.gaussianMixture -#' @export -#' @note summary(GaussianMixtureModel) since 2.1.0 -setMethod("summary", signature(object = "GaussianMixtureModel"), - function(object) { - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - lambda <- unlist(callJMethod(jobj, "lambda")) - muList <- callJMethod(jobj, "mu") - sigmaList <- callJMethod(jobj, "sigma") - k <- callJMethod(jobj, "k") - dim <- callJMethod(jobj, "dim") - mu <- c() - for (i in 1 : k) { - start <- (i - 1) * dim + 1 - end <- i * dim - mu[[i]] <- unlist(muList[start : end]) - } - sigma <- c() - for (i in 1 : k) { - start <- (i - 1) * dim * dim + 1 - end <- i * dim * dim - sigma[[i]] <- t(matrix(sigmaList[start : end], ncol = dim)) - } - posterior <- if (is.loaded) { - NULL - } else { - dataFrame(callJMethod(jobj, "posterior")) - } - list(lambda = lambda, mu = mu, sigma = sigma, - posterior = posterior, is.loaded = is.loaded) - }) - -# Predicted values based on a gaussian mixture model - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction". 
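To make the reshaping in summary() for GaussianMixtureModel above concrete: the JVM wrapper returns lambda, mu, and sigma as flat vectors, and the loops slice them back into k mean vectors and k covariance matrices. A base-R sketch with made-up values (not part of the patch; k = 2 components in 2 dimensions, with `dims` standing in for the `dim` value returned by the wrapper):

muList <- c(0, 0, 3, 4)                      # k * dims values, one component after another
sigmaList <- c(1, 0, 0, 1, 2, 0.5, 0.5, 2)   # k * dims * dims values
k <- 2
dims <- 2
mu <- lapply(seq_len(k), function(i) muList[((i - 1) * dims + 1):(i * dims)])
sigma <- lapply(seq_len(k), function(i) {
  t(matrix(sigmaList[((i - 1) * dims^2 + 1):(i * dims^2)], ncol = dims))
})
mu      # list of 2 mean vectors
sigma   # list of 2 2x2 covariance matrices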
-#' @aliases predict,GaussianMixtureModel,SparkDataFrame-method -#' @rdname spark.gaussianMixture -#' @export -#' @note predict(GaussianMixtureModel) since 2.1.0 -setMethod("predict", signature(object = "GaussianMixtureModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' Alternating Least Squares (ALS) for Collaborative Filtering -#' -#' \code{spark.als} learns latent factors in collaborative filtering via alternating least -#' squares. Users can call \code{summary} to obtain fitted latent factors, \code{predict} -#' to make predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' For more details, see -#' \href{http://spark.apache.org/docs/latest/ml-collaborative-filtering.html}{MLlib: -#' Collaborative Filtering}. -#' -#' @param data a SparkDataFrame for training. -#' @param ratingCol column name for ratings. -#' @param userCol column name for user ids. Ids must be (or can be coerced into) integers. -#' @param itemCol column name for item ids. Ids must be (or can be coerced into) integers. -#' @param rank rank of the matrix factorization (> 0). -#' @param regParam regularization parameter (>= 0). -#' @param maxIter maximum number of iterations (>= 0). -#' @param nonnegative logical value indicating whether to apply nonnegativity constraints. -#' @param implicitPrefs logical value indicating whether to use implicit preference. -#' @param alpha alpha parameter in the implicit preference formulation (>= 0). -#' @param seed integer seed for random number generation. -#' @param numUserBlocks number of user blocks used to parallelize computation (> 0). -#' @param numItemBlocks number of item blocks used to parallelize computation (> 0). -#' @param checkpointInterval number of checkpoint intervals (>= 1) or disable checkpoint (-1). -#' @param ... additional argument(s) passed to the method. -#' @return \code{spark.als} returns a fitted ALS model. 
-#' @rdname spark.als -#' @aliases spark.als,SparkDataFrame-method -#' @name spark.als -#' @export -#' @examples -#' \dontrun{ -#' ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), -#' list(2, 1, 1.0), list(2, 2, 5.0)) -#' df <- createDataFrame(ratings, c("user", "item", "rating")) -#' model <- spark.als(df, "rating", "user", "item") -#' -#' # extract latent factors -#' stats <- summary(model) -#' userFactors <- stats$userFactors -#' itemFactors <- stats$itemFactors -#' -#' # make predictions -#' predicted <- predict(model, df) -#' showDF(predicted) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' -#' # set other arguments -#' modelS <- spark.als(df, "rating", "user", "item", rank = 20, -#' regParam = 0.1, nonnegative = TRUE) -#' statsS <- summary(modelS) -#' } -#' @note spark.als since 2.1.0 -setMethod("spark.als", signature(data = "SparkDataFrame"), - function(data, ratingCol = "rating", userCol = "user", itemCol = "item", - rank = 10, regParam = 0.1, maxIter = 10, nonnegative = FALSE, - implicitPrefs = FALSE, alpha = 1.0, numUserBlocks = 10, numItemBlocks = 10, - checkpointInterval = 10, seed = 0) { - - if (!is.numeric(rank) || rank <= 0) { - stop("rank should be a positive number.") - } - if (!is.numeric(regParam) || regParam < 0) { - stop("regParam should be a nonnegative number.") - } - if (!is.numeric(maxIter) || maxIter <= 0) { - stop("maxIter should be a positive number.") - } - - jobj <- callJStatic("org.apache.spark.ml.r.ALSWrapper", - "fit", data@sdf, ratingCol, userCol, itemCol, as.integer(rank), - regParam, as.integer(maxIter), implicitPrefs, alpha, nonnegative, - as.integer(numUserBlocks), as.integer(numItemBlocks), - as.integer(checkpointInterval), as.integer(seed)) - new("ALSModel", jobj = jobj) - }) - -# Returns a summary of the ALS model produced by spark.als. - -#' @param object a fitted ALS model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{user} (the names of the user column), -#' \code{item} (the item column), \code{rating} (the rating column), \code{userFactors} -#' (the estimated user factors), \code{itemFactors} (the estimated item factors), -#' and \code{rank} (rank of the matrix factorization model). -#' @rdname spark.als -#' @aliases summary,ALSModel-method -#' @export -#' @note summary(ALSModel) since 2.1.0 -setMethod("summary", signature(object = "ALSModel"), - function(object) { - jobj <- object@jobj - user <- callJMethod(jobj, "userCol") - item <- callJMethod(jobj, "itemCol") - rating <- callJMethod(jobj, "ratingCol") - userFactors <- dataFrame(callJMethod(jobj, "userFactors")) - itemFactors <- dataFrame(callJMethod(jobj, "itemFactors")) - rank <- callJMethod(jobj, "rank") - list(user = user, item = item, rating = rating, userFactors = userFactors, - itemFactors = itemFactors, rank = rank) - }) - - -# Makes predictions from an ALS model or a model produced by spark.als. - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted values. -#' @rdname spark.als -#' @aliases predict,ALSModel-method -#' @export -#' @note predict(ALSModel) since 2.1.0 -setMethod("predict", signature(object = "ALSModel"), - function(object, newData) { - predict_internal(object, newData) - }) - - -# Saves the ALS model to the input path. - -#' @param path the directory where the model is saved. 
-#' @param overwrite logical value indicating whether to overwrite if the output path -#' already exists. Default is FALSE which means throw exception -#' if the output path exists. -#' -#' @rdname spark.als -#' @aliases write.ml,ALSModel,character-method -#' @export -#' @seealso \link{read.ml} -#' @note write.ml(ALSModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "ALSModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' (One-Sample) Kolmogorov-Smirnov Test -#' -#' @description -#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a -#' continuous distribution. -#' -#' By comparing the largest difference between the empirical cumulative -#' distribution of the sample data and the theoretical distribution we can provide a test for the -#' the null hypothesis that the sample data comes from that theoretical distribution. -#' -#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest} -#' to print out a summary result. -#' -#' @param data a SparkDataFrame of user data. -#' @param testCol column name where the test data is from. It should be a column of double type. -#' @param nullHypothesis name of the theoretical distribution tested against. Currently only -#' \code{"norm"} for normal distribution is supported. -#' @param distParams parameters(s) of the distribution. For \code{nullHypothesis = "norm"}, -#' we can provide as a vector the mean and standard deviation of -#' the distribution. If none is provided, then standard normal will be used. -#' If only one is provided, then the standard deviation will be set to be one. -#' @param ... additional argument(s) passed to the method. -#' @return \code{spark.kstest} returns a test result object. -#' @rdname spark.kstest -#' @aliases spark.kstest,SparkDataFrame-method -#' @name spark.kstest -#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{ -#' MLlib: Hypothesis Testing} -#' @export -#' @examples -#' \dontrun{ -#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25)) -#' df <- createDataFrame(data) -#' test <- spark.kstest(df, "test", "norm", c(0, 1)) -#' -#' # get a summary of the test result -#' testSummary <- summary(test) -#' testSummary -#' -#' # print out the summary in an organized way -#' print.summary.KSTest(testSummary) -#' } -#' @note spark.kstest since 2.1.0 -setMethod("spark.kstest", signature(data = "SparkDataFrame"), - function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) { - tryCatch(match.arg(nullHypothesis), - error = function(e) { - msg <- paste("Distribution", nullHypothesis, "is not supported.") - stop(msg) - }) - if (nullHypothesis == "norm") { - distParams <- as.numeric(distParams) - mu <- ifelse(length(distParams) < 1, 0, distParams[1]) - sigma <- ifelse(length(distParams) < 2, 1, distParams[2]) - jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper", - "test", data@sdf, testCol, nullHypothesis, - as.array(c(mu, sigma))) - new("KSTest", jobj = jobj) - } -}) - -# Get the summary of Kolmogorov-Smirnov (KS) Test. -#' @param object test result object of KSTest by \code{spark.kstest}. -#' @return \code{summary} returns summary information of KSTest object, which is a list. 
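The distParams handling in spark.kstest() above fills in defaults position by position: no values gives the standard normal, and a single value is taken as the mean with the standard deviation defaulting to 1. A small sketch of that logic (resolve_norm_params is a hypothetical helper used only for illustration):

resolve_norm_params <- function(distParams) {
  distParams <- as.numeric(distParams)
  mu <- ifelse(length(distParams) < 1, 0, distParams[1])
  sigma <- ifelse(length(distParams) < 2, 1, distParams[2])
  c(mean = mu, sd = sigma)
}
resolve_norm_params(numeric(0))  # mean 0, sd 1 (standard normal)
resolve_norm_params(10)          # mean 10, sd 1
resolve_norm_params(c(10, 2))    # mean 10, sd 2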
-#' The list includes the \code{p.value} (p-value), \code{statistic} (test statistic -#' computed for the test), \code{nullHypothesis} (the null hypothesis with its -#' parameters tested against) and \code{degreesOfFreedom} (degrees of freedom of the test). -#' @rdname spark.kstest -#' @aliases summary,KSTest-method -#' @export -#' @note summary(KSTest) since 2.1.0 -setMethod("summary", signature(object = "KSTest"), - function(object) { - jobj <- object@jobj - pValue <- callJMethod(jobj, "pValue") - statistic <- callJMethod(jobj, "statistic") - nullHypothesis <- callJMethod(jobj, "nullHypothesis") - distName <- callJMethod(jobj, "distName") - distParams <- unlist(callJMethod(jobj, "distParams")) - degreesOfFreedom <- callJMethod(jobj, "degreesOfFreedom") - - ans <- list(p.value = pValue, statistic = statistic, nullHypothesis = nullHypothesis, - nullHypothesis.name = distName, nullHypothesis.parameters = distParams, - degreesOfFreedom = degreesOfFreedom, jobj = jobj) - class(ans) <- "summary.KSTest" - ans - }) - -# Prints the summary of KSTest - -#' @rdname spark.kstest -#' @param x summary object of KSTest returned by \code{summary}. -#' @export -#' @note print.summary.KSTest since 2.1.0 -print.summary.KSTest <- function(x, ...) { - jobj <- x$jobj - summaryStr <- callJMethod(jobj, "summary") - cat(summaryStr, "\n") - invisible(x) -} - -#' Random Forest Model for Regression and Classification -#' -#' \code{spark.randomForest} fits a Random Forest Regression model or Classification model on -#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted Random Forest -#' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to -#' save/load fitted models. -#' For more details, see -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{ -#' Random Forest Regression} and -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{ -#' Random Forest Classification} -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', ':', '+', and '-'. -#' @param type type of model, one of "regression" or "classification", to fit -#' @param maxDepth Maximum depth of the tree (>= 0). -#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing -#' how to split on features at each node. More bins give higher granularity. Must be -#' >= 2 and >= number of categories in any categorical feature. -#' @param numTrees Number of trees to train (>= 1). -#' @param impurity Criterion used for information gain calculation. -#' For regression, must be "variance". For classification, must be one of -#' "entropy" and "gini", default is "gini". -#' @param featureSubsetStrategy The number of features to consider for splits at each tree node. -#' Supported options: "auto", "all", "onethird", "sqrt", "log2", (0.0-1.0], [1-n]. -#' @param seed integer seed for random number generation. -#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in -#' range (0, 1]. -#' @param minInstancesPerNode Minimum number of instances each child must have after split. -#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. -#' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). 
-#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. -#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with -#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching -#' can speed up training of deeper trees. Users can set how often should the -#' cache be checkpointed or disable it by setting checkpointInterval. -#' @param ... additional arguments passed to the method. -#' @aliases spark.randomForest,SparkDataFrame,formula-method -#' @return \code{spark.randomForest} returns a fitted Random Forest model. -#' @rdname spark.randomForest -#' @name spark.randomForest -#' @export -#' @examples -#' \dontrun{ -#' # fit a Random Forest Regression Model -#' df <- createDataFrame(longley) -#' model <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16) -#' -#' # get the summary of the model -#' summary(model) -#' -#' # make predictions -#' predictions <- predict(model, df) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' -#' # fit a Random Forest Classification Model -#' df <- createDataFrame(iris) -#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification") -#' } -#' @note spark.randomForest since 2.1.0 -setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, type = c("regression", "classification"), - maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL, - featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, - minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, - maxMemoryInMB = 256, cacheNodeIds = FALSE) { - type <- match.arg(type) - formula <- paste(deparse(formula), collapse = "") - if (!is.null(seed)) { - seed <- as.character(as.integer(seed)) - } - switch(type, - regression = { - if (is.null(impurity)) impurity <- "variance" - impurity <- match.arg(impurity, "variance") - jobj <- callJStatic("org.apache.spark.ml.r.RandomForestRegressorWrapper", - "fit", data@sdf, formula, as.integer(maxDepth), - as.integer(maxBins), as.integer(numTrees), - impurity, as.integer(minInstancesPerNode), - as.numeric(minInfoGain), as.integer(checkpointInterval), - as.character(featureSubsetStrategy), seed, - as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) - new("RandomForestRegressionModel", jobj = jobj) - }, - classification = { - if (is.null(impurity)) impurity <- "gini" - impurity <- match.arg(impurity, c("gini", "entropy")) - jobj <- callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper", - "fit", data@sdf, formula, as.integer(maxDepth), - as.integer(maxBins), as.integer(numTrees), - impurity, as.integer(minInstancesPerNode), - as.numeric(minInfoGain), as.integer(checkpointInterval), - as.character(featureSubsetStrategy), seed, - as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) - new("RandomForestClassificationModel", jobj = jobj) - } - ) - }) - -# Makes predictions from a Random Forest Regression model or Classification model - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named -#' "prediction". 
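As the impurity documentation above describes, the default criterion depends on the model type: "variance" for regression and "gini" for classification, with "entropy" also accepted for classification. A sketch of that selection (default_impurity is a hypothetical helper for illustration; the real logic lives in the switch() above):

default_impurity <- function(type, impurity = NULL) {
  switch(type,
    regression = if (is.null(impurity)) "variance" else match.arg(impurity, "variance"),
    classification = if (is.null(impurity)) "gini" else match.arg(impurity, c("gini", "entropy"))
  )
}
default_impurity("regression")                 # "variance"
default_impurity("classification")             # "gini"
default_impurity("classification", "entropy")  # "entropy"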
-#' @rdname spark.randomForest -#' @aliases predict,RandomForestRegressionModel-method -#' @export -#' @note predict(RandomForestRegressionModel) since 2.1.0 -setMethod("predict", signature(object = "RandomForestRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' @rdname spark.randomForest -#' @aliases predict,RandomForestClassificationModel-method -#' @export -#' @note predict(RandomForestClassificationModel) since 2.1.0 -setMethod("predict", signature(object = "RandomForestClassificationModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Save the Random Forest Regression or Classification model to the input path. - -#' @param object A fitted Random Forest regression model or classification model. -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @aliases write.ml,RandomForestRegressionModel,character-method -#' @rdname spark.randomForest -#' @export -#' @note write.ml(RandomForestRegressionModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "RandomForestRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' @aliases write.ml,RandomForestClassificationModel,character-method -#' @rdname spark.randomForest -#' @export -#' @note write.ml(RandomForestClassificationModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "RandomForestClassificationModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Create the summary of a tree ensemble model (eg. Random Forest, GBT) -summary.treeEnsemble <- function(model) { - jobj <- model@jobj - formula <- callJMethod(jobj, "formula") - numFeatures <- callJMethod(jobj, "numFeatures") - features <- callJMethod(jobj, "features") - featureImportances <- callJMethod(callJMethod(jobj, "featureImportances"), "toString") - numTrees <- callJMethod(jobj, "numTrees") - treeWeights <- callJMethod(jobj, "treeWeights") - list(formula = formula, - numFeatures = numFeatures, - features = features, - featureImportances = featureImportances, - numTrees = numTrees, - treeWeights = treeWeights, - jobj = jobj) -} - -# Get the summary of a Random Forest Regression Model - -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list of components includes \code{formula} (formula), -#' \code{numFeatures} (number of features), \code{features} (list of features), -#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees), -#' and \code{treeWeights} (tree weights). 
-#' @rdname spark.randomForest -#' @aliases summary,RandomForestRegressionModel-method -#' @export -#' @note summary(RandomForestRegressionModel) since 2.1.0 -setMethod("summary", signature(object = "RandomForestRegressionModel"), - function(object) { - ans <- summary.treeEnsemble(object) - class(ans) <- "summary.RandomForestRegressionModel" - ans - }) - -# Get the summary of a Random Forest Classification Model - -#' @rdname spark.randomForest -#' @aliases summary,RandomForestClassificationModel-method -#' @export -#' @note summary(RandomForestClassificationModel) since 2.1.0 -setMethod("summary", signature(object = "RandomForestClassificationModel"), - function(object) { - ans <- summary.treeEnsemble(object) - class(ans) <- "summary.RandomForestClassificationModel" - ans - }) - -# Prints the summary of tree ensemble models (eg. Random Forest, GBT) -print.summary.treeEnsemble <- function(x) { - jobj <- x$jobj - cat("Formula: ", x$formula) - cat("\nNumber of features: ", x$numFeatures) - cat("\nFeatures: ", unlist(x$features)) - cat("\nFeature importances: ", x$featureImportances) - cat("\nNumber of trees: ", x$numTrees) - cat("\nTree weights: ", unlist(x$treeWeights)) - - summaryStr <- callJMethod(jobj, "summary") - cat("\n", summaryStr, "\n") - invisible(x) -} - -# Prints the summary of Random Forest Regression Model - -#' @param x summary object of Random Forest regression model or classification model -#' returned by \code{summary}. -#' @rdname spark.randomForest -#' @export -#' @note print.summary.RandomForestRegressionModel since 2.1.0 -print.summary.RandomForestRegressionModel <- function(x, ...) { - print.summary.treeEnsemble(x) -} - -# Prints the summary of Random Forest Classification Model - -#' @rdname spark.randomForest -#' @export -#' @note print.summary.RandomForestClassificationModel since 2.1.0 -print.summary.RandomForestClassificationModel <- function(x, ...) { - print.summary.treeEnsemble(x) -} - -#' Gradient Boosted Tree Model for Regression and Classification -#' -#' \code{spark.gbt} fits a Gradient Boosted Tree Regression model or Classification model on a -#' SparkDataFrame. Users can call \code{summary} to get a summary of the fitted -#' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and -#' \code{write.ml}/\code{read.ml} to save/load fitted models. -#' For more details, see -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{ -#' GBT Regression} and -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{ -#' GBT Classification} -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', ':', '+', and '-'. -#' @param type type of model, one of "regression" or "classification", to fit -#' @param maxDepth Maximum depth of the tree (>= 0). -#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing -#' how to split on features at each node. More bins give higher granularity. Must be -#' >= 2 and >= number of categories in any categorical feature. -#' @param maxIter Param for maximum number of iterations (>= 0). -#' @param stepSize Param for Step size to be used for each iteration of optimization. -#' @param lossType Loss function which GBT tries to minimize. -#' For classification, must be "logistic". 
For regression, must be one of -#' "squared" (L2) and "absolute" (L1), default is "squared". -#' @param seed integer seed for random number generation. -#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in -#' range (0, 1]. -#' @param minInstancesPerNode Minimum number of instances each child must have after split. If a -#' split causes the left or right child to have fewer than -#' minInstancesPerNode, the split will be discarded as invalid. Should be -#' >= 1. -#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. -#' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). -#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. -#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with -#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching -#' can speed up training of deeper trees. Users can set how often should the -#' cache be checkpointed or disable it by setting checkpointInterval. -#' @param ... additional arguments passed to the method. -#' @aliases spark.gbt,SparkDataFrame,formula-method -#' @return \code{spark.gbt} returns a fitted Gradient Boosted Tree model. -#' @rdname spark.gbt -#' @name spark.gbt -#' @export -#' @examples -#' \dontrun{ -#' # fit a Gradient Boosted Tree Regression Model -#' df <- createDataFrame(longley) -#' model <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16) -#' -#' # get the summary of the model -#' summary(model) -#' -#' # make predictions -#' predictions <- predict(model, df) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' -#' # fit a Gradient Boosted Tree Classification Model -#' # label must be binary - Only binary classification is supported for GBT. 
-#' df <- createDataFrame(iris[iris$Species != "virginica", ]) -#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification") -#' -#' # numeric label is also supported -#' iris2 <- iris[iris$Species != "virginica", ] -#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) -#' df <- createDataFrame(iris2) -#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification") -#' } -#' @note spark.gbt since 2.1.0 -setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, type = c("regression", "classification"), - maxDepth = 5, maxBins = 32, maxIter = 20, stepSize = 0.1, lossType = NULL, - seed = NULL, subsamplingRate = 1.0, minInstancesPerNode = 1, minInfoGain = 0.0, - checkpointInterval = 10, maxMemoryInMB = 256, cacheNodeIds = FALSE) { - type <- match.arg(type) - formula <- paste(deparse(formula), collapse = "") - if (!is.null(seed)) { - seed <- as.character(as.integer(seed)) - } - switch(type, - regression = { - if (is.null(lossType)) lossType <- "squared" - lossType <- match.arg(lossType, c("squared", "absolute")) - jobj <- callJStatic("org.apache.spark.ml.r.GBTRegressorWrapper", - "fit", data@sdf, formula, as.integer(maxDepth), - as.integer(maxBins), as.integer(maxIter), - as.numeric(stepSize), as.integer(minInstancesPerNode), - as.numeric(minInfoGain), as.integer(checkpointInterval), - lossType, seed, as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) - new("GBTRegressionModel", jobj = jobj) - }, - classification = { - if (is.null(lossType)) lossType <- "logistic" - lossType <- match.arg(lossType, "logistic") - jobj <- callJStatic("org.apache.spark.ml.r.GBTClassifierWrapper", - "fit", data@sdf, formula, as.integer(maxDepth), - as.integer(maxBins), as.integer(maxIter), - as.numeric(stepSize), as.integer(minInstancesPerNode), - as.numeric(minInfoGain), as.integer(checkpointInterval), - lossType, seed, as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) - new("GBTClassificationModel", jobj = jobj) - } - ) - }) - -# Makes predictions from a Gradient Boosted Tree Regression model or Classification model - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named -#' "prediction". -#' @rdname spark.gbt -#' @aliases predict,GBTRegressionModel-method -#' @export -#' @note predict(GBTRegressionModel) since 2.1.0 -setMethod("predict", signature(object = "GBTRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' @rdname spark.gbt -#' @aliases predict,GBTClassificationModel-method -#' @export -#' @note predict(GBTClassificationModel) since 2.1.0 -setMethod("predict", signature(object = "GBTClassificationModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Save the Gradient Boosted Tree Regression or Classification model to the input path. - -#' @param object A fitted Gradient Boosted Tree regression model or classification model. -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. 
-#' @aliases write.ml,GBTRegressionModel,character-method -#' @rdname spark.gbt -#' @export -#' @note write.ml(GBTRegressionModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "GBTRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' @aliases write.ml,GBTClassificationModel,character-method -#' @rdname spark.gbt -#' @export -#' @note write.ml(GBTClassificationModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "GBTClassificationModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Get the summary of a Gradient Boosted Tree Regression Model - -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list of components includes \code{formula} (formula), -#' \code{numFeatures} (number of features), \code{features} (list of features), -#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees), -#' and \code{treeWeights} (tree weights). -#' @rdname spark.gbt -#' @aliases summary,GBTRegressionModel-method -#' @export -#' @note summary(GBTRegressionModel) since 2.1.0 -setMethod("summary", signature(object = "GBTRegressionModel"), - function(object) { - ans <- summary.treeEnsemble(object) - class(ans) <- "summary.GBTRegressionModel" - ans - }) - -# Get the summary of a Gradient Boosted Tree Classification Model - -#' @rdname spark.gbt -#' @aliases summary,GBTClassificationModel-method -#' @export -#' @note summary(GBTClassificationModel) since 2.1.0 -setMethod("summary", signature(object = "GBTClassificationModel"), - function(object) { - ans <- summary.treeEnsemble(object) - class(ans) <- "summary.GBTClassificationModel" - ans - }) - -# Prints the summary of Gradient Boosted Tree Regression Model - -#' @param x summary object of Gradient Boosted Tree regression model or classification model -#' returned by \code{summary}. -#' @rdname spark.gbt -#' @export -#' @note print.summary.GBTRegressionModel since 2.1.0 -print.summary.GBTRegressionModel <- function(x, ...) { - print.summary.treeEnsemble(x) -} - -# Prints the summary of Gradient Boosted Tree Classification Model - -#' @rdname spark.gbt -#' @export -#' @note print.summary.GBTClassificationModel since 2.1.0 -print.summary.GBTClassificationModel <- function(x, ...) { - print.summary.treeEnsemble(x) -} diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R new file mode 100644 index 0000000000000..8da84499dfc09 --- /dev/null +++ b/R/pkg/R/mllib_classification.R @@ -0,0 +1,417 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+# mllib_classification.R: Provides methods for MLlib classification algorithms
+# (except for tree-based algorithms) integration
+
+#' S4 class that represents a LogisticRegressionModel
+#'
+#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel
+#' @export
+#' @note LogisticRegressionModel since 2.1.0
+setClass("LogisticRegressionModel", representation(jobj = "jobj"))
+
+#' S4 class that represents a MultilayerPerceptronClassificationModel
+#'
+#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper
+#' @export
+#' @note MultilayerPerceptronClassificationModel since 2.1.0
+setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj"))
+
+#' S4 class that represents a NaiveBayesModel
+#'
+#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
+#' @export
+#' @note NaiveBayesModel since 2.0.0
+setClass("NaiveBayesModel", representation(jobj = "jobj"))
+
+#' Logistic Regression Model
+#'
+#' Fits a logistic regression model against a Spark DataFrame. It supports "binomial": Binary logistic regression
+#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet.
+#' Users can print, make predictions on the produced model and save the model to the input path.
+#'
+#' @param data a SparkDataFrame for training.
+#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
+#' operators are supported, including '~', '.', ':', '+', and '-'.
+#' @param regParam the regularization parameter.
+#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty.
+#' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination
+#' of L1 and L2. Default is 0.0 which is an L2 penalty.
+#' @param maxIter maximum iteration number.
+#' @param tol convergence tolerance of iterations.
+#' @param family the name of family which is a description of the label distribution to be used in the model.
+#' Supported options:
+#' \itemize{
+#' \item{"auto": Automatically select the family based on the number of classes:
+#' If number of classes == 1 || number of classes == 2, set to "binomial".
+#' Else, set to "multinomial".}
+#' \item{"binomial": Binary logistic regression with pivoting.}
+#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.}
+#' }
+#' @param standardization whether to standardize the training features before fitting the model. The coefficients
+#' of models will be always returned on the original scale, so it will be transparent for
+#' users. Note that with/without standardization, the models should be always converged
+#' to the same solution when no regularization is applied. Default is TRUE, same as glmnet.
+#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1
+#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0
+#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with
+#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification,
+#' thresholds can be used to adjust the probability of predicting each class. The array must have
+#' length equal to the number of classes, with values > 0, excepting that at most one value may be 0.
The class with largest value p/t is predicted, where p
+#' is the original probability of that class and t is the class's threshold.
+#' @param weightCol The weight column name.
+#' @param ... additional arguments passed to the method.
+#' @return \code{spark.logit} returns a fitted logistic regression model.
+#' @rdname spark.logit
+#' @aliases spark.logit,SparkDataFrame,formula-method
+#' @name spark.logit
+#' @export
+#' @examples
+#' \dontrun{
+#' sparkR.session()
+#' # binary logistic regression
+#' df <- createDataFrame(iris)
+#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
+#' model <- spark.logit(training, Species ~ ., regParam = 0.5)
+#' summary <- summary(model)
+#'
+#' # fitted values on training data
+#' fitted <- predict(model, training)
+#'
+#' # save fitted model to input path
+#' path <- "path/to/model"
+#' write.ml(model, path)
+#'
+#' # can also read back the saved model and predict
+#' # Note that summary does not work on a loaded model
+#' savedModel <- read.ml(path)
+#' summary(savedModel)
+#'
+#' # multinomial logistic regression
+#'
+#' df <- createDataFrame(iris)
+#' model <- spark.logit(df, Species ~ ., regParam = 0.5)
+#' summary <- summary(model)
+#'
+#' }
+#' @note spark.logit since 2.1.0
+setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
+                   tol = 1E-6, family = "auto", standardization = TRUE,
+                   thresholds = 0.5, weightCol = NULL) {
+            formula <- paste(deparse(formula), collapse = "")
+
+            if (is.null(weightCol)) {
+              weightCol <- ""
+            }
+
+            jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
+                                data@sdf, formula, as.numeric(regParam),
+                                as.numeric(elasticNetParam), as.integer(maxIter),
+                                as.numeric(tol), as.character(family),
+                                as.logical(standardization), as.array(thresholds),
+                                as.character(weightCol))
+            new("LogisticRegressionModel", jobj = jobj)
+          })
+
+# Get the summary of a LogisticRegressionModel
+
+#' @param object a LogisticRegressionModel fitted by \code{spark.logit}.
+#' @return \code{summary} returns summary information of the fitted model, which is a list.
+#'         The list includes \code{coefficients} (coefficients matrix of the fitted model).
+#' @rdname spark.logit
+#' @aliases summary,LogisticRegressionModel-method
+#' @export
+#' @note summary(LogisticRegressionModel) since 2.1.0
+setMethod("summary", signature(object = "LogisticRegressionModel"),
+          function(object) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "rFeatures")
+            labels <- callJMethod(jobj, "labels")
+            coefficients <- callJMethod(jobj, "rCoefficients")
+            nCol <- length(coefficients) / length(features)
+            coefficients <- matrix(coefficients, ncol = nCol)
+            # If nCol == 1, this is a binomial logistic regression model with pivoting.
+            # Otherwise, it is a multinomial logistic regression model without pivoting.
+            if (nCol == 1) {
+              colnames(coefficients) <- c("Estimate")
+            } else {
+              colnames(coefficients) <- unlist(labels)
+            }
+            rownames(coefficients) <- unlist(features)
+
+            list(coefficients = coefficients)
+          })
+
+# Predicted values based on a LogisticRegressionModel
+
+#' @param newData a SparkDataFrame for testing.
+#' @return \code{predict} returns the predicted values based on a LogisticRegressionModel.
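To illustrate the thresholds rule documented above (the class with the largest p/t is predicted), here is a plain-R sketch; predict_with_thresholds is a hypothetical helper used only for illustration:

predict_with_thresholds <- function(probs, thresholds) {
  which.max(probs / thresholds) - 1  # 0-based class label
}
probs <- c(0.40, 0.60)                           # estimated class probabilities
predict_with_thresholds(probs, c(0.5, 0.5))      # 1: plain argmax
predict_with_thresholds(probs, c(0.2, 0.8))      # 0: class 1 only wins when its probability > 0.8
predict_with_thresholds(probs, c(1 - 0.8, 0.8))  # 0: same as a single binary threshold of 0.8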
+#' @rdname spark.logit +#' @aliases predict,LogisticRegressionModel,SparkDataFrame-method +#' @export +#' @note predict(LogisticRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "LogisticRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save fitted LogisticRegressionModel to the input path + +#' @param path The directory where the model is saved. +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @rdname spark.logit +#' @aliases write.ml,LogisticRegressionModel,character-method +#' @export +#' @note write.ml(LogisticRegression, character) since 2.1.0 +setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' Multilayer Perceptron Classification Model +#' +#' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' Only categorical data is supported. +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{ +#' Multilayer Perceptron} +#' +#' @param data a \code{SparkDataFrame} of observations and labels for model fitting. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' @param blockSize blockSize parameter. +#' @param layers integer vector containing the number of nodes for each layer. +#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs". +#' @param maxIter maximum iteration number. +#' @param tol convergence tolerance of iterations. +#' @param stepSize stepSize parameter. +#' @param seed seed parameter for weights initialization. +#' @param initialWeights initialWeights parameter for weights initialization, it should be a +#' numeric vector. +#' @param ... additional arguments passed to the method. +#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. 
+#' @rdname spark.mlp
+#' @aliases spark.mlp,SparkDataFrame,formula-method
+#' @name spark.mlp
+#' @seealso \link{read.ml}
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
+#'
+#' # fit a Multilayer Perceptron Classification Model
+#' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs",
+#'                    maxIter = 100, tol = 0.5, stepSize = 1, seed = 1,
+#'                    initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
+#'
+#' # get the summary of the model
+#' summary(model)
+#'
+#' # make predictions
+#' predictions <- predict(model, df)
+#'
+#' # save and load the model
+#' path <- "path/to/model"
+#' write.ml(model, path)
+#' savedModel <- read.ml(path)
+#' summary(savedModel)
+#' }
+#' @note spark.mlp since 2.1.0
+setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
+                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) {
+            formula <- paste(deparse(formula), collapse = "")
+            if (is.null(layers)) {
+              stop("layers must be an integer vector with length > 1.")
+            }
+            layers <- as.integer(na.omit(layers))
+            if (length(layers) <= 1) {
+              stop("layers must be an integer vector with length > 1.")
+            }
+            if (!is.null(seed)) {
+              seed <- as.character(as.integer(seed))
+            }
+            if (!is.null(initialWeights)) {
+              initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
+            }
+            jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
+                                "fit", data@sdf, formula, as.integer(blockSize), as.array(layers),
+                                as.character(solver), as.integer(maxIter), as.numeric(tol),
+                                as.numeric(stepSize), seed, initialWeights)
+            new("MultilayerPerceptronClassificationModel", jobj = jobj)
+          })
+
+# Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp}
+
+#' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp}.
+#' @return \code{summary} returns summary information of the fitted model, which is a list.
+#'         The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs}
+#'         (number of outputs), \code{layers} (array of layer sizes including input
+#'         and output layers), and \code{weights} (the weights of layers).
+#'         For \code{weights}, it is a numeric vector with length equal to the number of
+#'         connections expected for the architecture (i.e., for an 8-10-2 network,
+#'         112 connection weights).
+#' @rdname spark.mlp
+#' @export
+#' @aliases summary,MultilayerPerceptronClassificationModel-method
+#' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0
+setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"),
+          function(object) {
+            jobj <- object@jobj
+            layers <- unlist(callJMethod(jobj, "layers"))
+            numOfInputs <- head(layers, n = 1)
+            numOfOutputs <- tail(layers, n = 1)
+            weights <- callJMethod(jobj, "weights")
+            list(numOfInputs = numOfInputs, numOfOutputs = numOfOutputs,
+                 layers = layers, weights = weights)
+          })
+
+# Makes predictions from a model produced by spark.mlp().
+
+#' @param newData a SparkDataFrame for testing.
+#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
+#' "prediction".
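The weights length mentioned in the summary() documentation above follows from the layer sizes: each layer contributes (inputs + 1) * outputs connections, the +1 being the bias unit. A quick sketch (expected_weights is a hypothetical helper for illustration):

expected_weights <- function(layers) {
  sum((layers[-length(layers)] + 1) * layers[-1])  # (inputs + 1) * outputs per layer
}
expected_weights(c(8, 10, 2))  # 112, the 8-10-2 network cited in the documentation above
expected_weights(c(4, 3))      # 15, matching the length of initialWeights in the example above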
+#' @rdname spark.mlp +#' @aliases predict,MultilayerPerceptronClassificationModel-method +#' @export +#' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0 +setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Saves the Multilayer Perceptron Classification Model to the input path. + +#' @param path the directory where the model is saved. +#' @param overwrite overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @rdname spark.mlp +#' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method +#' @export +#' @seealso \link{write.ml} +#' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationModel", + path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' Naive Bayes Models +#' +#' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' Only categorical data is supported. +#' +#' @param data a \code{SparkDataFrame} of observations and labels for model fitting. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' @param smoothing smoothing parameter. +#' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}. +#' @return \code{spark.naiveBayes} returns a fitted naive Bayes model. +#' @rdname spark.naiveBayes +#' @aliases spark.naiveBayes,SparkDataFrame,formula-method +#' @name spark.naiveBayes +#' @seealso e1071: \url{https://cran.r-project.org/package=e1071} +#' @export +#' @examples +#' \dontrun{ +#' data <- as.data.frame(UCBAdmissions) +#' df <- createDataFrame(data) +#' +#' # fit a Bernoulli naive Bayes model +#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0) +#' +#' # get the summary of the model +#' summary(model) +#' +#' # make predictions +#' predictions <- predict(model, df) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.naiveBayes since 2.0.0 +setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, smoothing = 1.0) { + formula <- paste(deparse(formula), collapse = "") + jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit", + formula, data@sdf, smoothing) + new("NaiveBayesModel", jobj = jobj) + }) + +# Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} + +#' @param object a naive Bayes model fitted by \code{spark.naiveBayes}. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes \code{apriori} (the label distribution) and +#' \code{tables} (conditional probabilities given the target label). 
+#' @rdname spark.naiveBayes
+#' @export
+#' @note summary(NaiveBayesModel) since 2.0.0
+setMethod("summary", signature(object = "NaiveBayesModel"),
+          function(object) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "features")
+            labels <- callJMethod(jobj, "labels")
+            apriori <- callJMethod(jobj, "apriori")
+            apriori <- t(as.matrix(unlist(apriori)))
+            colnames(apriori) <- unlist(labels)
+            tables <- callJMethod(jobj, "tables")
+            tables <- matrix(tables, nrow = length(labels))
+            rownames(tables) <- unlist(labels)
+            colnames(tables) <- unlist(features)
+            list(apriori = apriori, tables = tables)
+          })
+
+# Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(),
+# similarly to R package e1071's predict.
+
+#' @param newData a SparkDataFrame for testing.
+#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
+#' "prediction".
+#' @rdname spark.naiveBayes
+#' @export
+#' @note predict(NaiveBayesModel) since 2.0.0
+setMethod("predict", signature(object = "NaiveBayesModel"),
+          function(object, newData) {
+            predict_internal(object, newData)
+          })
+
+# Saves the Bernoulli naive Bayes model to the input path.
+
+#' @param path the directory where the model is saved.
+#' @param overwrite overwrites or not if the output path already exists. Default is FALSE
+#' which means throw exception if the output path exists.
+#'
+#' @rdname spark.naiveBayes
+#' @export
+#' @seealso \link{write.ml}
+#' @note write.ml(NaiveBayesModel, character) since 2.0.0
+setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"),
+          function(object, path, overwrite = FALSE) {
+            write_internal(object, path, overwrite)
+          })
diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R
new file mode 100644
index 0000000000000..c44358838703f
--- /dev/null
+++ b/R/pkg/R/mllib_clustering.R
@@ -0,0 +1,456 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +# mllib_clustering.R: Provides methods for MLlib clustering algorithms integration + +#' S4 class that represents a GaussianMixtureModel +#' +#' @param jobj a Java object reference to the backing Scala GaussianMixtureModel +#' @export +#' @note GaussianMixtureModel since 2.1.0 +setClass("GaussianMixtureModel", representation(jobj = "jobj")) + +#' S4 class that represents a KMeansModel +#' +#' @param jobj a Java object reference to the backing Scala KMeansModel +#' @export +#' @note KMeansModel since 2.0.0 +setClass("KMeansModel", representation(jobj = "jobj")) + +#' S4 class that represents an LDAModel +#' +#' @param jobj a Java object reference to the backing Scala LDAWrapper +#' @export +#' @note LDAModel since 2.1.0 +setClass("LDAModel", representation(jobj = "jobj")) + +#' Multivariate Gaussian Mixture Model (GMM) +#' +#' Fits multivariate gaussian mixture model against a Spark DataFrame, similarly to R's +#' mvnormalmixEM(). Users can call \code{summary} to print a summary of the fitted model, +#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} +#' to save/load fitted models. +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' Note that the response variable of formula is empty in spark.gaussianMixture. +#' @param k number of independent Gaussians in the mixture model. +#' @param maxIter maximum iteration number. +#' @param tol the convergence tolerance. +#' @param ... additional arguments passed to the method. +#' @aliases spark.gaussianMixture,SparkDataFrame,formula-method +#' @return \code{spark.gaussianMixture} returns a fitted multivariate gaussian mixture model. +#' @rdname spark.gaussianMixture +#' @name spark.gaussianMixture +#' @seealso mixtools: \url{https://cran.r-project.org/package=mixtools} +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' library(mvtnorm) +#' set.seed(100) +#' a <- rmvnorm(4, c(0, 0)) +#' b <- rmvnorm(6, c(3, 4)) +#' data <- rbind(a, b) +#' df <- createDataFrame(as.data.frame(data)) +#' model <- spark.gaussianMixture(df, ~ V1 + V2, k = 2) +#' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "V1", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.gaussianMixture since 2.1.0 +#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} +setMethod("spark.gaussianMixture", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, k = 2, maxIter = 100, tol = 0.01) { + formula <- paste(deparse(formula), collapse = "") + jobj <- callJStatic("org.apache.spark.ml.r.GaussianMixtureWrapper", "fit", data@sdf, + formula, as.integer(k), as.integer(maxIter), as.numeric(tol)) + new("GaussianMixtureModel", jobj = jobj) + }) + +# Get the summary of a multivariate gaussian mixture model + +#' @param object a fitted gaussian mixture model. +#' @return \code{summary} returns summary of the fitted model, which is a list. +#' The list includes the model's \code{lambda} (lambda), \code{mu} (mu), +#' \code{sigma} (sigma), and \code{posterior} (posterior). 
+#' @aliases spark.gaussianMixture,SparkDataFrame,formula-method +#' @rdname spark.gaussianMixture +#' @export +#' @note summary(GaussianMixtureModel) since 2.1.0 +setMethod("summary", signature(object = "GaussianMixtureModel"), + function(object) { + jobj <- object@jobj + is.loaded <- callJMethod(jobj, "isLoaded") + lambda <- unlist(callJMethod(jobj, "lambda")) + muList <- callJMethod(jobj, "mu") + sigmaList <- callJMethod(jobj, "sigma") + k <- callJMethod(jobj, "k") + dim <- callJMethod(jobj, "dim") + mu <- c() + for (i in 1 : k) { + start <- (i - 1) * dim + 1 + end <- i * dim + mu[[i]] <- unlist(muList[start : end]) + } + sigma <- c() + for (i in 1 : k) { + start <- (i - 1) * dim * dim + 1 + end <- i * dim * dim + sigma[[i]] <- t(matrix(sigmaList[start : end], ncol = dim)) + } + posterior <- if (is.loaded) { + NULL + } else { + dataFrame(callJMethod(jobj, "posterior")) + } + list(lambda = lambda, mu = mu, sigma = sigma, + posterior = posterior, is.loaded = is.loaded) + }) + +# Predicted values based on a gaussian mixture model + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named +#' "prediction". +#' @aliases predict,GaussianMixtureModel,SparkDataFrame-method +#' @rdname spark.gaussianMixture +#' @export +#' @note predict(GaussianMixtureModel) since 2.1.0 +setMethod("predict", signature(object = "GaussianMixtureModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save fitted MLlib model to the input path + +#' @param path the directory where the model is saved. +#' @param overwrite overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @aliases write.ml,GaussianMixtureModel,character-method +#' @rdname spark.gaussianMixture +#' @export +#' @note write.ml(GaussianMixtureModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' K-Means Clustering Model +#' +#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' Note that the response variable of formula is empty in spark.kmeans. +#' @param k number of centers. +#' @param maxIter maximum iteration number. +#' @param initMode the initialization algorithm choosen to fit the model. +#' @param ... additional argument(s) passed to the method. +#' @return \code{spark.kmeans} returns a fitted k-means model. 
+#' @rdname spark.kmeans +#' @aliases spark.kmeans,SparkDataFrame,formula-method +#' @name spark.kmeans +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' data(iris) +#' df <- createDataFrame(iris) +#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random") +#' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.kmeans since 2.0.0 +#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} +setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) { + formula <- paste(deparse(formula), collapse = "") + initMode <- match.arg(initMode) + jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula, + as.integer(k), as.integer(maxIter), initMode) + new("KMeansModel", jobj = jobj) + }) + +# Get the summary of a k-means model + +#' @param object a fitted k-means model. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes the model's \code{k} (number of cluster centers), +#' \code{coefficients} (model cluster centers), +#' \code{size} (number of data points in each cluster), and \code{cluster} +#' (cluster centers of the transformed data). +#' @rdname spark.kmeans +#' @export +#' @note summary(KMeansModel) since 2.0.0 +setMethod("summary", signature(object = "KMeansModel"), + function(object) { + jobj <- object@jobj + is.loaded <- callJMethod(jobj, "isLoaded") + features <- callJMethod(jobj, "features") + coefficients <- callJMethod(jobj, "coefficients") + k <- callJMethod(jobj, "k") + size <- callJMethod(jobj, "size") + coefficients <- t(matrix(coefficients, ncol = k)) + colnames(coefficients) <- unlist(features) + rownames(coefficients) <- 1:k + cluster <- if (is.loaded) { + NULL + } else { + dataFrame(callJMethod(jobj, "cluster")) + } + list(k = k, coefficients = coefficients, size = size, + cluster = cluster, is.loaded = is.loaded) + }) + +# Predicted values based on a k-means model + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns the predicted values based on a k-means model. +#' @rdname spark.kmeans +#' @export +#' @note predict(KMeansModel) since 2.0.0 +setMethod("predict", signature(object = "KMeansModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +#' Get fitted result from a k-means model +#' +#' Get fitted result from a k-means model, similarly to R's fitted(). +#' Note: A saved-loaded model does not support this method. +#' +#' @param object a fitted k-means model. +#' @param method type of fitted results, \code{"centers"} for cluster centers +#' or \code{"classes"} for assigned classes. +#' @param ... additional argument(s) passed to the method. +#' @return \code{fitted} returns a SparkDataFrame containing fitted values. 
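A brief usage sketch for the k-means accessors documented above, reusing the model and df objects from the spark.kmeans @examples; fitted here is the KMeansModel method defined just below and is only available on a freshly trained model, not one loaded back via read.ml.

    km <- summary(model)
    km$k               # number of cluster centers
    km$coefficients    # cluster centers: one row per cluster, one column per feature
    km$size            # number of training points assigned to each cluster

    head(fitted(model, "classes"))   # cluster assignment per training row
    head(predict(model, df))         # assignments for arbitrary new data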
+#' @rdname fitted +#' @export +#' @examples +#' \dontrun{ +#' model <- spark.kmeans(trainingData, ~ ., 2) +#' fitted.model <- fitted(model) +#' showDF(fitted.model) +#'} +#' @note fitted since 2.0.0 +setMethod("fitted", signature(object = "KMeansModel"), + function(object, method = c("centers", "classes")) { + method <- match.arg(method) + jobj <- object@jobj + is.loaded <- callJMethod(jobj, "isLoaded") + if (is.loaded) { + stop("Saved-loaded k-means model does not support 'fitted' method") + } else { + dataFrame(callJMethod(jobj, "fitted", method)) + } + }) + +# Save fitted MLlib model to the input path + +#' @param path the directory where the model is saved. +#' @param overwrite overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @rdname spark.kmeans +#' @export +#' @note write.ml(KMeansModel, character) since 2.0.0 +setMethod("write.ml", signature(object = "KMeansModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' Latent Dirichlet Allocation +#' +#' \code{spark.lda} fits a Latent Dirichlet Allocation model on a SparkDataFrame. Users can call +#' \code{summary} to get a summary of the fitted LDA model, \code{spark.posterior} to compute +#' posterior probabilities on new data, \code{spark.perplexity} to compute log perplexity on new +#' data and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' +#' @param data A SparkDataFrame for training. +#' @param features Features column name. Either libSVM-format column or character-format column is +#' valid. +#' @param k Number of topics. +#' @param maxIter Maximum iterations. +#' @param optimizer Optimizer to train an LDA model, "online" or "em", default is "online". +#' @param subsamplingRate (For online optimizer) Fraction of the corpus to be sampled and used in +#' each iteration of mini-batch gradient descent, in range (0, 1]. +#' @param topicConcentration concentration parameter (commonly named \code{beta} or \code{eta}) for +#' the prior placed on topic distributions over terms, default -1 to set automatically on the +#' Spark side. Use \code{summary} to retrieve the effective topicConcentration. Only 1-size +#' numeric is accepted. +#' @param docConcentration concentration parameter (commonly named \code{alpha}) for the +#' prior placed on documents distributions over topics (\code{theta}), default -1 to set +#' automatically on the Spark side. Use \code{summary} to retrieve the effective +#' docConcentration. Only 1-size or \code{k}-size numeric is accepted. +#' @param customizedStopWords stopwords that need to be removed from the given corpus. Ignore the +#' parameter if libSVM-format column is used as the features column. +#' @param maxVocabSize maximum vocabulary size, default 1 << 18 +#' @param ... additional argument(s) passed to the method. +#' @return \code{spark.lda} returns a fitted Latent Dirichlet Allocation model. 
+#' @rdname spark.lda +#' @aliases spark.lda,SparkDataFrame-method +#' @seealso topicmodels: \url{https://cran.r-project.org/package=topicmodels} +#' @export +#' @examples +#' \dontrun{ +#' # nolint start +#' # An example "path/to/file" can be +#' # paste0(Sys.getenv("SPARK_HOME"), "/data/mllib/sample_lda_libsvm_data.txt") +#' # nolint end +#' text <- read.df("path/to/file", source = "libsvm") +#' model <- spark.lda(data = text, optimizer = "em") +#' +#' # get a summary of the model +#' summary(model) +#' +#' # compute posterior probabilities +#' posterior <- spark.posterior(model, text) +#' showDF(posterior) +#' +#' # compute perplexity +#' perplexity <- spark.perplexity(model, text) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.lda since 2.1.0 +setMethod("spark.lda", signature(data = "SparkDataFrame"), + function(data, features = "features", k = 10, maxIter = 20, optimizer = c("online", "em"), + subsamplingRate = 0.05, topicConcentration = -1, docConcentration = -1, + customizedStopWords = "", maxVocabSize = bitwShiftL(1, 18)) { + optimizer <- match.arg(optimizer) + jobj <- callJStatic("org.apache.spark.ml.r.LDAWrapper", "fit", data@sdf, features, + as.integer(k), as.integer(maxIter), optimizer, + as.numeric(subsamplingRate), topicConcentration, + as.array(docConcentration), as.array(customizedStopWords), + maxVocabSize) + new("LDAModel", jobj = jobj) + }) + +# Returns the summary of a Latent Dirichlet Allocation model produced by \code{spark.lda} + +#' @param object A Latent Dirichlet Allocation model fitted by \code{spark.lda}. +#' @param maxTermsPerTopic Maximum number of terms to collect for each topic. Default value of 10. +#' @return \code{summary} returns summary information of the fitted model, which is a list. 
+#' The list includes +#' \item{\code{docConcentration}}{concentration parameter commonly named \code{alpha} for +#' the prior placed on documents distributions over topics \code{theta}} +#' \item{\code{topicConcentration}}{concentration parameter commonly named \code{beta} or +#' \code{eta} for the prior placed on topic distributions over terms} +#' \item{\code{logLikelihood}}{log likelihood of the entire corpus} +#' \item{\code{logPerplexity}}{log perplexity} +#' \item{\code{isDistributed}}{TRUE for distributed model while FALSE for local model} +#' \item{\code{vocabSize}}{number of terms in the corpus} +#' \item{\code{topics}}{top 10 terms and their weights of all topics} +#' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file +#' used as training set} +#' @rdname spark.lda +#' @aliases summary,LDAModel-method +#' @export +#' @note summary(LDAModel) since 2.1.0 +setMethod("summary", signature(object = "LDAModel"), + function(object, maxTermsPerTopic) { + maxTermsPerTopic <- as.integer(ifelse(missing(maxTermsPerTopic), 10, maxTermsPerTopic)) + jobj <- object@jobj + docConcentration <- callJMethod(jobj, "docConcentration") + topicConcentration <- callJMethod(jobj, "topicConcentration") + logLikelihood <- callJMethod(jobj, "logLikelihood") + logPerplexity <- callJMethod(jobj, "logPerplexity") + isDistributed <- callJMethod(jobj, "isDistributed") + vocabSize <- callJMethod(jobj, "vocabSize") + topics <- dataFrame(callJMethod(jobj, "topics", maxTermsPerTopic)) + vocabulary <- callJMethod(jobj, "vocabulary") + list(docConcentration = unlist(docConcentration), + topicConcentration = topicConcentration, + logLikelihood = logLikelihood, logPerplexity = logPerplexity, + isDistributed = isDistributed, vocabSize = vocabSize, + topics = topics, vocabulary = unlist(vocabulary)) + }) + +# Returns the log perplexity of a Latent Dirichlet Allocation model produced by \code{spark.lda} + +#' @return \code{spark.perplexity} returns the log perplexity of given SparkDataFrame, or the log +#' perplexity of the training data if missing argument "data". +#' @rdname spark.lda +#' @aliases spark.perplexity,LDAModel-method +#' @export +#' @note spark.perplexity(LDAModel) since 2.1.0 +setMethod("spark.perplexity", signature(object = "LDAModel", data = "SparkDataFrame"), + function(object, data) { + ifelse(missing(data), callJMethod(object@jobj, "logPerplexity"), + callJMethod(object@jobj, "computeLogPerplexity", data@sdf)) + }) + +# Returns posterior probabilities from a Latent Dirichlet Allocation model produced by spark.lda() + +#' @param newData A SparkDataFrame for testing. +#' @return \code{spark.posterior} returns a SparkDataFrame containing posterior probabilities +#' vectors named "topicDistribution". +#' @rdname spark.lda +#' @aliases spark.posterior,LDAModel,SparkDataFrame-method +#' @export +#' @note spark.posterior(LDAModel) since 2.1.0 +setMethod("spark.posterior", signature(object = "LDAModel", newData = "SparkDataFrame"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Saves the Latent Dirichlet Allocation model to the input path. + +#' @param path The directory where the model is saved. +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. 
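A short, illustrative follow-on to the LDA helpers above, reusing the model and text objects from the spark.lda @examples; nothing here goes beyond the summary, spark.perplexity and spark.posterior methods just defined.

    lda <- summary(model, maxTermsPerTopic = 5)
    lda$vocabSize          # number of terms in the training corpus
    lda$logPerplexity      # log perplexity on the training data
    head(lda$topics)       # top terms per topic, as a SparkDataFrame

    spark.perplexity(model, text)        # log perplexity on a given corpus; lower is better
    head(spark.posterior(model, text))   # "topicDistribution" column with per-document topic mixtures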
+#' +#' @rdname spark.lda +#' @aliases write.ml,LDAModel,character-method +#' @export +#' @seealso \link{read.ml} +#' @note write.ml(LDAModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "LDAModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) diff --git a/R/pkg/R/mllib_recommendation.R b/R/pkg/R/mllib_recommendation.R new file mode 100644 index 0000000000000..fa794249085d7 --- /dev/null +++ b/R/pkg/R/mllib_recommendation.R @@ -0,0 +1,162 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# mllib_recommendation.R: Provides methods for MLlib recommendation algorithms integration + +#' S4 class that represents an ALSModel +#' +#' @param jobj a Java object reference to the backing Scala ALSWrapper +#' @export +#' @note ALSModel since 2.1.0 +setClass("ALSModel", representation(jobj = "jobj")) + +#' Alternating Least Squares (ALS) for Collaborative Filtering +#' +#' \code{spark.als} learns latent factors in collaborative filtering via alternating least +#' squares. Users can call \code{summary} to obtain fitted latent factors, \code{predict} +#' to make predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-collaborative-filtering.html}{MLlib: +#' Collaborative Filtering}. +#' +#' @param data a SparkDataFrame for training. +#' @param ratingCol column name for ratings. +#' @param userCol column name for user ids. Ids must be (or can be coerced into) integers. +#' @param itemCol column name for item ids. Ids must be (or can be coerced into) integers. +#' @param rank rank of the matrix factorization (> 0). +#' @param regParam regularization parameter (>= 0). +#' @param maxIter maximum number of iterations (>= 0). +#' @param nonnegative logical value indicating whether to apply nonnegativity constraints. +#' @param implicitPrefs logical value indicating whether to use implicit preference. +#' @param alpha alpha parameter in the implicit preference formulation (>= 0). +#' @param seed integer seed for random number generation. +#' @param numUserBlocks number of user blocks used to parallelize computation (> 0). +#' @param numItemBlocks number of item blocks used to parallelize computation (> 0). +#' @param checkpointInterval number of checkpoint intervals (>= 1) or disable checkpoint (-1). +#' @param ... additional argument(s) passed to the method. +#' @return \code{spark.als} returns a fitted ALS model. 
+#' @rdname spark.als +#' @aliases spark.als,SparkDataFrame-method +#' @name spark.als +#' @export +#' @examples +#' \dontrun{ +#' ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), +#' list(2, 1, 1.0), list(2, 2, 5.0)) +#' df <- createDataFrame(ratings, c("user", "item", "rating")) +#' model <- spark.als(df, "rating", "user", "item") +#' +#' # extract latent factors +#' stats <- summary(model) +#' userFactors <- stats$userFactors +#' itemFactors <- stats$itemFactors +#' +#' # make predictions +#' predicted <- predict(model, df) +#' showDF(predicted) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' +#' # set other arguments +#' modelS <- spark.als(df, "rating", "user", "item", rank = 20, +#' regParam = 0.1, nonnegative = TRUE) +#' statsS <- summary(modelS) +#' } +#' @note spark.als since 2.1.0 +setMethod("spark.als", signature(data = "SparkDataFrame"), + function(data, ratingCol = "rating", userCol = "user", itemCol = "item", + rank = 10, regParam = 0.1, maxIter = 10, nonnegative = FALSE, + implicitPrefs = FALSE, alpha = 1.0, numUserBlocks = 10, numItemBlocks = 10, + checkpointInterval = 10, seed = 0) { + + if (!is.numeric(rank) || rank <= 0) { + stop("rank should be a positive number.") + } + if (!is.numeric(regParam) || regParam < 0) { + stop("regParam should be a nonnegative number.") + } + if (!is.numeric(maxIter) || maxIter <= 0) { + stop("maxIter should be a positive number.") + } + + jobj <- callJStatic("org.apache.spark.ml.r.ALSWrapper", + "fit", data@sdf, ratingCol, userCol, itemCol, as.integer(rank), + regParam, as.integer(maxIter), implicitPrefs, alpha, nonnegative, + as.integer(numUserBlocks), as.integer(numItemBlocks), + as.integer(checkpointInterval), as.integer(seed)) + new("ALSModel", jobj = jobj) + }) + +# Returns a summary of the ALS model produced by spark.als. + +#' @param object a fitted ALS model. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes \code{user} (the names of the user column), +#' \code{item} (the item column), \code{rating} (the rating column), \code{userFactors} +#' (the estimated user factors), \code{itemFactors} (the estimated item factors), +#' and \code{rank} (rank of the matrix factorization model). +#' @rdname spark.als +#' @aliases summary,ALSModel-method +#' @export +#' @note summary(ALSModel) since 2.1.0 +setMethod("summary", signature(object = "ALSModel"), + function(object) { + jobj <- object@jobj + user <- callJMethod(jobj, "userCol") + item <- callJMethod(jobj, "itemCol") + rating <- callJMethod(jobj, "ratingCol") + userFactors <- dataFrame(callJMethod(jobj, "userFactors")) + itemFactors <- dataFrame(callJMethod(jobj, "itemFactors")) + rank <- callJMethod(jobj, "rank") + list(user = user, item = item, rating = rating, userFactors = userFactors, + itemFactors = itemFactors, rank = rank) + }) + +# Makes predictions from an ALS model or a model produced by spark.als. + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted values. +#' @rdname spark.als +#' @aliases predict,ALSModel-method +#' @export +#' @note predict(ALSModel) since 2.1.0 +setMethod("predict", signature(object = "ALSModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Saves the ALS model to the input path. + +#' @param path the directory where the model is saved. 
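A brief sketch of reading the ALS summary described above, reusing the df and model objects from the ratings @examples; the factor tables are ordinary SparkDataFrames and can be inspected like any other.

    als <- summary(model)
    als$rank                 # rank of the matrix factorization
    head(als$userFactors)    # estimated latent factors per user
    head(als$itemFactors)    # estimated latent factors per item

    head(predict(model, df)) # predicted ratings for the (user, item) pairs in df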
+#' @param overwrite logical value indicating whether to overwrite if the output path
+#'                  already exists. Default is FALSE which means throw exception
+#'                  if the output path exists.
+#'
+#' @rdname spark.als
+#' @aliases write.ml,ALSModel,character-method
+#' @export
+#' @seealso \link{read.ml}
+#' @note write.ml(ALSModel, character) since 2.1.0
+setMethod("write.ml", signature(object = "ALSModel", path = "character"),
+          function(object, path, overwrite = FALSE) {
+            write_internal(object, path, overwrite)
+          })
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
new file mode 100644
index 0000000000000..a480168a29126
--- /dev/null
+++ b/R/pkg/R/mllib_regression.R
@@ -0,0 +1,448 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# mllib_regression.R: Provides methods for MLlib regression algorithms
+# (except for tree-based algorithms) integration
+
+#' S4 class that represents an AFTSurvivalRegressionModel
+#'
+#' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper
+#' @export
+#' @note AFTSurvivalRegressionModel since 2.0.0
+setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
+
+#' S4 class that represents a generalized linear model
+#'
+#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
+#' @export
+#' @note GeneralizedLinearRegressionModel since 2.0.0
+setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
+
+#' S4 class that represents an IsotonicRegressionModel
+#'
+#' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel
+#' @export
+#' @note IsotonicRegressionModel since 2.1.0
+setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
+
+#' Generalized Linear Models
+#'
+#' Fits a generalized linear model against a Spark DataFrame.
+#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
+#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
+#'
+#' @param data a SparkDataFrame for training.
+#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
+#'                operators are supported, including '~', '.', ':', '+', and '-'.
+#' @param family a description of the error distribution and link function to be used in the model.
+#'               This can be a character string naming a family function, a family function or
+#'               the result of a call to a family function. Refer R family at
+#'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#' @param tol positive convergence tolerance of iterations.
+#' @param maxIter integer giving the maximal number of IRLS iterations.
+#' @param weightCol the weight column name.
If this is not set or \code{NULL}, we treat all instance +#' weights as 1.0. +#' @param regParam regularization parameter for L2 regularization. +#' @param ... additional arguments passed to the method. +#' @aliases spark.glm,SparkDataFrame,formula-method +#' @return \code{spark.glm} returns a fitted generalized linear model. +#' @rdname spark.glm +#' @name spark.glm +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' data(iris) +#' df <- createDataFrame(iris) +#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") +#' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.glm since 2.0.0 +#' @seealso \link{glm}, \link{read.ml} +setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, + regParam = 0.0) { + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } + if (is.function(family)) { + family <- family() + } + if (is.null(family$family)) { + print(family) + stop("'family' not recognized") + } + + formula <- paste(deparse(formula), collapse = "") + if (is.null(weightCol)) { + weightCol <- "" + } + + jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", + "fit", formula, data@sdf, family$family, family$link, + tol, as.integer(maxIter), as.character(weightCol), regParam) + new("GeneralizedLinearRegressionModel", jobj = jobj) + }) + +#' Generalized Linear Models (R-compliant) +#' +#' Fits a generalized linear model, similarly to R's glm(). +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' @param data a SparkDataFrame or R's glm data for training. +#' @param family a description of the error distribution and link function to be used in the model. +#' This can be a character string naming a family function, a family function or +#' the result of a call to a family function. Refer R family at +#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. +#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance +#' weights as 1.0. +#' @param epsilon positive convergence tolerance of iterations. +#' @param maxit integer giving the maximal number of IRLS iterations. +#' @return \code{glm} returns a fitted generalized linear model. +#' @rdname glm +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' data(iris) +#' df <- createDataFrame(iris) +#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian") +#' summary(model) +#' } +#' @note glm since 1.5.0 +#' @seealso \link{spark.glm} +setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), + function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) { + spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol) + }) + +# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). + +#' @param object a fitted generalized linear model. 
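The family handling in spark.glm above mirrors base R's glm(): a character name, a family function, or the result of calling a family function all resolve to the same family object. An illustrative sketch, reusing the df object from the iris-based @examples:

    # These three calls fit the same gaussian model with the default identity link
    m1 <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
    m2 <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = gaussian)
    m3 <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = gaussian(link = "identity"))

    # The R-compliant wrapper defined just above delegates to spark.glm()
    m4 <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")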
+#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list of components includes at least the \code{coefficients} (coefficients matrix, which includes +#' coefficients, standard error of coefficients, t value and p value), +#' \code{null.deviance} (null/residual degrees of freedom), \code{aic} (AIC) +#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in the data, +#' the coefficients matrix only provides coefficients. +#' @rdname spark.glm +#' @export +#' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 +setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), + function(object) { + jobj <- object@jobj + is.loaded <- callJMethod(jobj, "isLoaded") + features <- callJMethod(jobj, "rFeatures") + coefficients <- callJMethod(jobj, "rCoefficients") + dispersion <- callJMethod(jobj, "rDispersion") + null.deviance <- callJMethod(jobj, "rNullDeviance") + deviance <- callJMethod(jobj, "rDeviance") + df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull") + df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom") + aic <- callJMethod(jobj, "rAic") + iter <- callJMethod(jobj, "rNumIterations") + family <- callJMethod(jobj, "rFamily") + deviance.resid <- if (is.loaded) { + NULL + } else { + dataFrame(callJMethod(jobj, "rDevianceResiduals")) + } + # If the underlying WeightedLeastSquares using "normal" solver, we can provide + # coefficients, standard error of coefficients, t value and p value. Otherwise, + # it will be fitted by local "l-bfgs", we can only provide coefficients. + if (length(features) == length(coefficients)) { + coefficients <- matrix(coefficients, ncol = 1) + colnames(coefficients) <- c("Estimate") + rownames(coefficients) <- unlist(features) + } else { + coefficients <- matrix(coefficients, ncol = 4) + colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") + rownames(coefficients) <- unlist(features) + } + ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, + dispersion = dispersion, null.deviance = null.deviance, + deviance = deviance, df.null = df.null, df.residual = df.residual, + aic = aic, iter = iter, family = family, is.loaded = is.loaded) + class(ans) <- "summary.GeneralizedLinearRegressionModel" + ans + }) + +# Prints the summary of GeneralizedLinearRegressionModel + +#' @rdname spark.glm +#' @param x summary object of fitted generalized linear model returned by \code{summary} function. +#' @export +#' @note print.summary.GeneralizedLinearRegressionModel since 2.0.0 +print.summary.GeneralizedLinearRegressionModel <- function(x, ...) 
{ + if (x$is.loaded) { + cat("\nSaved-loaded model does not support output 'Deviance Residuals'.\n") + } else { + x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals", + c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max")) + x$deviance.resid <- zapsmall(x$deviance.resid, 5L) + cat("\nDeviance Residuals: \n") + cat("(Note: These are approximate quantiles with relative error <= 0.01)\n") + print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L) + } + + cat("\nCoefficients:\n") + print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L) + + cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion), + ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"), + format(unlist(x[c("null.deviance", "deviance")]), digits = 5L), + " on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"), + 1L, paste, collapse = " "), sep = "") + cat("AIC: ", format(x$aic, digits = 4L), "\n\n", + "Number of Fisher Scoring iterations: ", x$iter, "\n\n", sep = "") + invisible(x) + } + +# Makes predictions from a generalized linear model produced by glm() or spark.glm(), +# similarly to R's predict(). + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named +#' "prediction". +#' @rdname spark.glm +#' @export +#' @note predict(GeneralizedLinearRegressionModel) since 1.5.0 +setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Saves the generalized linear model to the input path. + +#' @param path the directory where the model is saved. +#' @param overwrite overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @rdname spark.glm +#' @export +#' @note write.ml(GeneralizedLinearRegressionModel, character) since 2.0.0 +setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' Isotonic Regression Model +#' +#' Fits an Isotonic Regression model against a Spark DataFrame, similarly to R's isoreg(). +#' Users can print, make predictions on the produced model and save the model to the input path. +#' +#' @param data SparkDataFrame for training. +#' @param formula A symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' @param isotonic Whether the output sequence should be isotonic/increasing (TRUE) or +#' antitonic/decreasing (FALSE). +#' @param featureIndex The index of the feature if \code{featuresCol} is a vector column +#' (default: 0), no effect otherwise. +#' @param weightCol The weight column name. +#' @param ... additional arguments passed to the method. +#' @return \code{spark.isoreg} returns a fitted Isotonic Regression model. 
+#' @rdname spark.isoreg +#' @aliases spark.isoreg,SparkDataFrame,formula-method +#' @name spark.isoreg +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' data <- list(list(7.0, 0.0), list(5.0, 1.0), list(3.0, 2.0), +#' list(5.0, 3.0), list(1.0, 4.0)) +#' df <- createDataFrame(data, c("label", "feature")) +#' model <- spark.isoreg(df, label ~ feature, isotonic = FALSE) +#' # return model boundaries and prediction as lists +#' result <- summary(model, df) +#' # prediction based on fitted model +#' predict_data <- list(list(-2.0), list(-1.0), list(0.5), +#' list(0.75), list(1.0), list(2.0), list(9.0)) +#' predict_df <- createDataFrame(predict_data, c("feature")) +#' # get prediction column +#' predict_result <- collect(select(predict(model, predict_df), "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.isoreg since 2.1.0 +setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) { + formula <- paste(deparse(formula), collapse = "") + + if (is.null(weightCol)) { + weightCol <- "" + } + + jobj <- callJStatic("org.apache.spark.ml.r.IsotonicRegressionWrapper", "fit", + data@sdf, formula, as.logical(isotonic), as.integer(featureIndex), + as.character(weightCol)) + new("IsotonicRegressionModel", jobj = jobj) + }) + +# Get the summary of an IsotonicRegressionModel model + +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes model's \code{boundaries} (boundaries in increasing order) +#' and \code{predictions} (predictions associated with the boundaries at the same index). +#' @rdname spark.isoreg +#' @aliases summary,IsotonicRegressionModel-method +#' @export +#' @note summary(IsotonicRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "IsotonicRegressionModel"), + function(object) { + jobj <- object@jobj + boundaries <- callJMethod(jobj, "boundaries") + predictions <- callJMethod(jobj, "predictions") + list(boundaries = boundaries, predictions = predictions) + }) + +# Predicted values based on an isotonicRegression model + +#' @param object a fitted IsotonicRegressionModel. +#' @param newData SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted values. +#' @rdname spark.isoreg +#' @aliases predict,IsotonicRegressionModel,SparkDataFrame-method +#' @export +#' @note predict(IsotonicRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "IsotonicRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save fitted IsotonicRegressionModel to the input path + +#' @param path The directory where the model is saved. +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. 
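A hedged sketch of inspecting the isotonic fit described above, reusing the model and df objects from the spark.isoreg @examples; boundaries and predictions together define the fitted step function, and predict() evaluates that function on new feature values.

    iso <- summary(model)
    iso$boundaries     # feature boundaries, in increasing order
    iso$predictions    # fitted value associated with each boundary

    head(predict(model, df))   # evaluates the fitted (here antitonic, isotonic = FALSE) function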
+#' +#' @rdname spark.isoreg +#' @aliases write.ml,IsotonicRegressionModel,character-method +#' @export +#' @note write.ml(IsotonicRegression, character) since 2.1.0 +setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' Accelerated Failure Time (AFT) Survival Regression Model +#' +#' \code{spark.survreg} fits an accelerated failure time (AFT) survival regression model on +#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted AFT model, +#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to +#' save/load fitted models. +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', ':', '+', and '-'. +#' Note that operator '.' is not supported currently. +#' @return \code{spark.survreg} returns a fitted AFT survival regression model. +#' @rdname spark.survreg +#' @seealso survival: \url{https://cran.r-project.org/package=survival} +#' @export +#' @examples +#' \dontrun{ +#' df <- createDataFrame(ovarian) +#' model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx) +#' +#' # get a summary of the model +#' summary(model) +#' +#' # make predictions +#' predicted <- predict(model, df) +#' showDF(predicted) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' } +#' @note spark.survreg since 2.0.0 +setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula) { + formula <- paste(deparse(formula), collapse = "") + jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper", + "fit", formula, data@sdf) + new("AFTSurvivalRegressionModel", jobj = jobj) + }) + +# Returns a summary of the AFT survival regression model produced by spark.survreg, +# similarly to R's summary(). + +#' @param object a fitted AFT survival regression model. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes the model's \code{coefficients} (features, coefficients, +#' intercept and log(scale)). +#' @rdname spark.survreg +#' @export +#' @note summary(AFTSurvivalRegressionModel) since 2.0.0 +setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), + function(object) { + jobj <- object@jobj + features <- callJMethod(jobj, "rFeatures") + coefficients <- callJMethod(jobj, "rCoefficients") + coefficients <- as.matrix(unlist(coefficients)) + colnames(coefficients) <- c("Value") + rownames(coefficients) <- unlist(features) + list(coefficients = coefficients) + }) + +# Makes predictions from an AFT survival regression model or a model produced by +# spark.survreg, similarly to R package survival's predict. + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted values +#' on the original scale of the data (mean predicted value at scale = 1.0). +#' @rdname spark.survreg +#' @export +#' @note predict(AFTSurvivalRegressionModel) since 2.0.0 +setMethod("predict", signature(object = "AFTSurvivalRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Saves the AFT survival regression model to the input path. + +#' @param path the directory where the model is saved. 
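One hedged note on reading the spark.survreg summary above: as in R's survreg, the AFT model is linear on the log of the survival time, so exponentiating a coefficient gives its multiplicative effect on the expected survival time. Illustrative only, reusing the model object from the ovarian @examples.

    aft <- summary(model)
    aft$coefficients                  # one "Value" column; rows are features, intercept and Log(scale)
    exp(aft$coefficients[, "Value"])  # multiplicative time ratios (ignore the Log(scale) row)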
+#' @param overwrite logical value indicating whether to overwrite if the output path
+#'                  already exists. Default is FALSE, which means an exception is thrown
+#'                  if the output path exists.
+#' @rdname spark.survreg
+#' @export
+#' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0
+#' @seealso \link{write.ml}
+setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "character"),
+          function(object, path, overwrite = FALSE) {
+            write_internal(object, path, overwrite)
+          })
diff --git a/R/pkg/R/mllib_stat.R b/R/pkg/R/mllib_stat.R
new file mode 100644
index 0000000000000..3e013f1d45e38
--- /dev/null
+++ b/R/pkg/R/mllib_stat.R
@@ -0,0 +1,127 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# mllib_stat.R: Provides methods for MLlib statistics algorithms integration
+
+#' S4 class that represents a KSTest
+#'
+#' @param jobj a Java object reference to the backing Scala KSTestWrapper
+#' @export
+#' @note KSTest since 2.1.0
+setClass("KSTest", representation(jobj = "jobj"))
+
+#' (One-Sample) Kolmogorov-Smirnov Test
+#'
+#' @description
+#' \code{spark.kstest} conducts the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
+#' continuous distribution.
+#'
+#' By comparing the largest difference between the empirical cumulative
+#' distribution of the sample data and the theoretical distribution, we can provide a test for
+#' the null hypothesis that the sample data comes from that theoretical distribution.
+#'
+#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
+#' to print out a summary result.
+#'
+#' @param data a SparkDataFrame of user data.
+#' @param testCol column name where the test data is from. It should be a column of double type.
+#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
+#'                       \code{"norm"} for normal distribution is supported.
+#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
+#'                   we can provide as a vector the mean and standard deviation of
+#'                   the distribution. If none is provided, then standard normal will be used.
+#'                   If only one is provided, then the standard deviation will be set to be one.
+#' @param ... additional argument(s) passed to the method.
+#' @return \code{spark.kstest} returns a test result object.
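For a quick local cross-check of the test described above, the same one-sample KS test can be run with base R's ks.test on the collected column; this is only sensible for small data, as in the @examples that follow, where df has a single numeric column named "test".

    test <- spark.kstest(df, "test", "norm", c(0, 1))
    summary(test)$p.value        # p-value from the Spark-side test against N(0, 1)

    local <- collect(df)         # small sample, safe to collect
    ks.test(local$test, "pnorm", 0, 1)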
+#' @rdname spark.kstest +#' @aliases spark.kstest,SparkDataFrame-method +#' @name spark.kstest +#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{ +#' MLlib: Hypothesis Testing} +#' @export +#' @examples +#' \dontrun{ +#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25)) +#' df <- createDataFrame(data) +#' test <- spark.kstest(df, "test", "norm", c(0, 1)) +#' +#' # get a summary of the test result +#' testSummary <- summary(test) +#' testSummary +#' +#' # print out the summary in an organized way +#' print.summary.KSTest(testSummary) +#' } +#' @note spark.kstest since 2.1.0 +setMethod("spark.kstest", signature(data = "SparkDataFrame"), + function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) { + tryCatch(match.arg(nullHypothesis), + error = function(e) { + msg <- paste("Distribution", nullHypothesis, "is not supported.") + stop(msg) + }) + if (nullHypothesis == "norm") { + distParams <- as.numeric(distParams) + mu <- ifelse(length(distParams) < 1, 0, distParams[1]) + sigma <- ifelse(length(distParams) < 2, 1, distParams[2]) + jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper", + "test", data@sdf, testCol, nullHypothesis, + as.array(c(mu, sigma))) + new("KSTest", jobj = jobj) + } +}) + +# Get the summary of Kolmogorov-Smirnov (KS) Test. + +#' @param object test result object of KSTest by \code{spark.kstest}. +#' @return \code{summary} returns summary information of KSTest object, which is a list. +#' The list includes the \code{p.value} (p-value), \code{statistic} (test statistic +#' computed for the test), \code{nullHypothesis} (the null hypothesis with its +#' parameters tested against) and \code{degreesOfFreedom} (degrees of freedom of the test). +#' @rdname spark.kstest +#' @aliases summary,KSTest-method +#' @export +#' @note summary(KSTest) since 2.1.0 +setMethod("summary", signature(object = "KSTest"), + function(object) { + jobj <- object@jobj + pValue <- callJMethod(jobj, "pValue") + statistic <- callJMethod(jobj, "statistic") + nullHypothesis <- callJMethod(jobj, "nullHypothesis") + distName <- callJMethod(jobj, "distName") + distParams <- unlist(callJMethod(jobj, "distParams")) + degreesOfFreedom <- callJMethod(jobj, "degreesOfFreedom") + + ans <- list(p.value = pValue, statistic = statistic, nullHypothesis = nullHypothesis, + nullHypothesis.name = distName, nullHypothesis.parameters = distParams, + degreesOfFreedom = degreesOfFreedom, jobj = jobj) + class(ans) <- "summary.KSTest" + ans + }) + +# Prints the summary of KSTest + +#' @rdname spark.kstest +#' @param x summary object of KSTest returned by \code{summary}. +#' @export +#' @note print.summary.KSTest since 2.1.0 +print.summary.KSTest <- function(x, ...) { + jobj <- x$jobj + summaryStr <- callJMethod(jobj, "summary") + cat(summaryStr, "\n") + invisible(x) +} diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R new file mode 100644 index 0000000000000..0d53fad061809 --- /dev/null +++ b/R/pkg/R/mllib_tree.R @@ -0,0 +1,496 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# mllib_tree.R: Provides methods for MLlib tree-based algorithms integration + +#' S4 class that represents a GBTRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala GBTRegressionModel +#' @export +#' @note GBTRegressionModel since 2.1.0 +setClass("GBTRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a GBTClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala GBTClassificationModel +#' @export +#' @note GBTClassificationModel since 2.1.0 +setClass("GBTClassificationModel", representation(jobj = "jobj")) + +#' S4 class that represents a RandomForestRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala RandomForestRegressionModel +#' @export +#' @note RandomForestRegressionModel since 2.1.0 +setClass("RandomForestRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a RandomForestClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala RandomForestClassificationModel +#' @export +#' @note RandomForestClassificationModel since 2.1.0 +setClass("RandomForestClassificationModel", representation(jobj = "jobj")) + +# Create the summary of a tree ensemble model (eg. Random Forest, GBT) +summary.treeEnsemble <- function(model) { + jobj <- model@jobj + formula <- callJMethod(jobj, "formula") + numFeatures <- callJMethod(jobj, "numFeatures") + features <- callJMethod(jobj, "features") + featureImportances <- callJMethod(callJMethod(jobj, "featureImportances"), "toString") + numTrees <- callJMethod(jobj, "numTrees") + treeWeights <- callJMethod(jobj, "treeWeights") + list(formula = formula, + numFeatures = numFeatures, + features = features, + featureImportances = featureImportances, + numTrees = numTrees, + treeWeights = treeWeights, + jobj = jobj) +} + +# Prints the summary of tree ensemble models (eg. Random Forest, GBT) +print.summary.treeEnsemble <- function(x) { + jobj <- x$jobj + cat("Formula: ", x$formula) + cat("\nNumber of features: ", x$numFeatures) + cat("\nFeatures: ", unlist(x$features)) + cat("\nFeature importances: ", x$featureImportances) + cat("\nNumber of trees: ", x$numTrees) + cat("\nTree weights: ", unlist(x$treeWeights)) + + summaryStr <- callJMethod(jobj, "summary") + cat("\n", summaryStr, "\n") + invisible(x) +} + +#' Gradient Boosted Tree Model for Regression and Classification +#' +#' \code{spark.gbt} fits a Gradient Boosted Tree Regression model or Classification model on a +#' SparkDataFrame. Users can call \code{summary} to get a summary of the fitted +#' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and +#' \code{write.ml}/\code{read.ml} to save/load fitted models. +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{ +#' GBT Regression} and +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{ +#' GBT Classification} +#' +#' @param data a SparkDataFrame for training. 
+#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', ':', '+', and '-'. +#' @param type type of model, one of "regression" or "classification", to fit +#' @param maxDepth Maximum depth of the tree (>= 0). +#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing +#' how to split on features at each node. More bins give higher granularity. Must be +#' >= 2 and >= number of categories in any categorical feature. +#' @param maxIter Param for maximum number of iterations (>= 0). +#' @param stepSize Param for Step size to be used for each iteration of optimization. +#' @param lossType Loss function which GBT tries to minimize. +#' For classification, must be "logistic". For regression, must be one of +#' "squared" (L2) and "absolute" (L1), default is "squared". +#' @param seed integer seed for random number generation. +#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in +#' range (0, 1]. +#' @param minInstancesPerNode Minimum number of instances each child must have after split. If a +#' split causes the left or right child to have fewer than +#' minInstancesPerNode, the split will be discarded as invalid. Should be +#' >= 1. +#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. +#' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. +#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with +#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching +#' can speed up training of deeper trees. Users can set how often should the +#' cache be checkpointed or disable it by setting checkpointInterval. +#' @param ... additional arguments passed to the method. +#' @aliases spark.gbt,SparkDataFrame,formula-method +#' @return \code{spark.gbt} returns a fitted Gradient Boosted Tree model. +#' @rdname spark.gbt +#' @name spark.gbt +#' @export +#' @examples +#' \dontrun{ +#' # fit a Gradient Boosted Tree Regression Model +#' df <- createDataFrame(longley) +#' model <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16) +#' +#' # get the summary of the model +#' summary(model) +#' +#' # make predictions +#' predictions <- predict(model, df) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' +#' # fit a Gradient Boosted Tree Classification Model +#' # label must be binary - Only binary classification is supported for GBT. 
+#' df <- createDataFrame(iris[iris$Species != "virginica", ]) +#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification") +#' +#' # numeric label is also supported +#' iris2 <- iris[iris$Species != "virginica", ] +#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) +#' df <- createDataFrame(iris2) +#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification") +#' } +#' @note spark.gbt since 2.1.0 +setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, type = c("regression", "classification"), + maxDepth = 5, maxBins = 32, maxIter = 20, stepSize = 0.1, lossType = NULL, + seed = NULL, subsamplingRate = 1.0, minInstancesPerNode = 1, minInfoGain = 0.0, + checkpointInterval = 10, maxMemoryInMB = 256, cacheNodeIds = FALSE) { + type <- match.arg(type) + formula <- paste(deparse(formula), collapse = "") + if (!is.null(seed)) { + seed <- as.character(as.integer(seed)) + } + switch(type, + regression = { + if (is.null(lossType)) lossType <- "squared" + lossType <- match.arg(lossType, c("squared", "absolute")) + jobj <- callJStatic("org.apache.spark.ml.r.GBTRegressorWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(maxIter), + as.numeric(stepSize), as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + lossType, seed, as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("GBTRegressionModel", jobj = jobj) + }, + classification = { + if (is.null(lossType)) lossType <- "logistic" + lossType <- match.arg(lossType, "logistic") + jobj <- callJStatic("org.apache.spark.ml.r.GBTClassifierWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(maxIter), + as.numeric(stepSize), as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + lossType, seed, as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("GBTClassificationModel", jobj = jobj) + } + ) + }) + +# Get the summary of a Gradient Boosted Tree Regression Model + +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list of components includes \code{formula} (formula), +#' \code{numFeatures} (number of features), \code{features} (list of features), +#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees), +#' and \code{treeWeights} (tree weights). +#' @rdname spark.gbt +#' @aliases summary,GBTRegressionModel-method +#' @export +#' @note summary(GBTRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "GBTRegressionModel"), + function(object) { + ans <- summary.treeEnsemble(object) + class(ans) <- "summary.GBTRegressionModel" + ans + }) + +# Prints the summary of Gradient Boosted Tree Regression Model + +#' @param x summary object of Gradient Boosted Tree regression model or classification model +#' returned by \code{summary}. +#' @rdname spark.gbt +#' @export +#' @note print.summary.GBTRegressionModel since 2.1.0 +print.summary.GBTRegressionModel <- function(x, ...) 
{ + print.summary.treeEnsemble(x) +} + +# Get the summary of a Gradient Boosted Tree Classification Model + +#' @rdname spark.gbt +#' @aliases summary,GBTClassificationModel-method +#' @export +#' @note summary(GBTClassificationModel) since 2.1.0 +setMethod("summary", signature(object = "GBTClassificationModel"), + function(object) { + ans <- summary.treeEnsemble(object) + class(ans) <- "summary.GBTClassificationModel" + ans + }) + +# Prints the summary of Gradient Boosted Tree Classification Model + +#' @rdname spark.gbt +#' @export +#' @note print.summary.GBTClassificationModel since 2.1.0 +print.summary.GBTClassificationModel <- function(x, ...) { + print.summary.treeEnsemble(x) +} + +# Makes predictions from a Gradient Boosted Tree Regression model or Classification model + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named +#' "prediction". +#' @rdname spark.gbt +#' @aliases predict,GBTRegressionModel-method +#' @export +#' @note predict(GBTRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "GBTRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +#' @rdname spark.gbt +#' @aliases predict,GBTClassificationModel-method +#' @export +#' @note predict(GBTClassificationModel) since 2.1.0 +setMethod("predict", signature(object = "GBTClassificationModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save the Gradient Boosted Tree Regression or Classification model to the input path. + +#' @param object A fitted Gradient Boosted Tree regression model or classification model. +#' @param path The directory where the model is saved. +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' @aliases write.ml,GBTRegressionModel,character-method +#' @rdname spark.gbt +#' @export +#' @note write.ml(GBTRegressionModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "GBTRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' @aliases write.ml,GBTClassificationModel,character-method +#' @rdname spark.gbt +#' @export +#' @note write.ml(GBTClassificationModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "GBTClassificationModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' Random Forest Model for Regression and Classification +#' +#' \code{spark.randomForest} fits a Random Forest Regression model or Classification model on +#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted Random Forest +#' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to +#' save/load fitted models. +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{ +#' Random Forest Regression} and +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{ +#' Random Forest Classification} +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', ':', '+', and '-'. 
+#' @param type the type of model to fit, one of "regression" or "classification". +#' @param maxDepth Maximum depth of the tree (>= 0). +#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing +#' how to split on features at each node. More bins give higher granularity. Must be +#' >= 2 and >= number of categories in any categorical feature. +#' @param numTrees Number of trees to train (>= 1). +#' @param impurity Criterion used for information gain calculation. +#' For regression, must be "variance". For classification, must be either +#' "entropy" or "gini"; the default is "gini". +#' @param featureSubsetStrategy The number of features to consider for splits at each tree node. +#' Supported options: "auto", "all", "onethird", "sqrt", "log2", (0.0-1.0], [1-n]. +#' @param seed integer seed for random number generation. +#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in +#' range (0, 1]. +#' @param minInstancesPerNode Minimum number of instances each child must have after a split. +#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. +#' @param checkpointInterval Checkpoint interval (>= 1), or -1 to disable checkpointing. +#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. +#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with +#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching +#' can speed up training of deeper trees. Users can control how often the +#' cache is checkpointed, or disable it, via checkpointInterval. +#' @param ... additional arguments passed to the method. +#' @aliases spark.randomForest,SparkDataFrame,formula-method +#' @return \code{spark.randomForest} returns a fitted Random Forest model. 
+#' @rdname spark.randomForest +#' @name spark.randomForest +#' @export +#' @examples +#' \dontrun{ +#' # fit a Random Forest Regression Model +#' df <- createDataFrame(longley) +#' model <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16) +#' +#' # get the summary of the model +#' summary(model) +#' +#' # make predictions +#' predictions <- predict(model, df) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' +#' # fit a Random Forest Classification Model +#' df <- createDataFrame(iris) +#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification") +#' } +#' @note spark.randomForest since 2.1.0 +setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, type = c("regression", "classification"), + maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL, + featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, + minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, + maxMemoryInMB = 256, cacheNodeIds = FALSE) { + type <- match.arg(type) + formula <- paste(deparse(formula), collapse = "") + if (!is.null(seed)) { + seed <- as.character(as.integer(seed)) + } + switch(type, + regression = { + if (is.null(impurity)) impurity <- "variance" + impurity <- match.arg(impurity, "variance") + jobj <- callJStatic("org.apache.spark.ml.r.RandomForestRegressorWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(numTrees), + impurity, as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + as.character(featureSubsetStrategy), seed, + as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("RandomForestRegressionModel", jobj = jobj) + }, + classification = { + if (is.null(impurity)) impurity <- "gini" + impurity <- match.arg(impurity, c("gini", "entropy")) + jobj <- callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(numTrees), + impurity, as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + as.character(featureSubsetStrategy), seed, + as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("RandomForestClassificationModel", jobj = jobj) + } + ) + }) + +# Get the summary of a Random Forest Regression Model + +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list of components includes \code{formula} (formula), +#' \code{numFeatures} (number of features), \code{features} (list of features), +#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees), +#' and \code{treeWeights} (tree weights). +#' @rdname spark.randomForest +#' @aliases summary,RandomForestRegressionModel-method +#' @export +#' @note summary(RandomForestRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "RandomForestRegressionModel"), + function(object) { + ans <- summary.treeEnsemble(object) + class(ans) <- "summary.RandomForestRegressionModel" + ans + }) + +# Prints the summary of Random Forest Regression Model + +#' @param x summary object of Random Forest regression model or classification model +#' returned by \code{summary}. 
+#' @rdname spark.randomForest +#' @export +#' @note print.summary.RandomForestRegressionModel since 2.1.0 +print.summary.RandomForestRegressionModel <- function(x, ...) { + print.summary.treeEnsemble(x) +} + +# Get the summary of a Random Forest Classification Model + +#' @rdname spark.randomForest +#' @aliases summary,RandomForestClassificationModel-method +#' @export +#' @note summary(RandomForestClassificationModel) since 2.1.0 +setMethod("summary", signature(object = "RandomForestClassificationModel"), + function(object) { + ans <- summary.treeEnsemble(object) + class(ans) <- "summary.RandomForestClassificationModel" + ans + }) + +# Prints the summary of a Random Forest Classification Model + +#' @rdname spark.randomForest +#' @export +#' @note print.summary.RandomForestClassificationModel since 2.1.0 +print.summary.RandomForestClassificationModel <- function(x, ...) { + print.summary.treeEnsemble(x) +} + +# Makes predictions from a Random Forest Regression model or Classification model + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named +#' "prediction". +#' @rdname spark.randomForest +#' @aliases predict,RandomForestRegressionModel-method +#' @export +#' @note predict(RandomForestRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "RandomForestRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +#' @rdname spark.randomForest +#' @aliases predict,RandomForestClassificationModel-method +#' @export +#' @note predict(RandomForestClassificationModel) since 2.1.0 +setMethod("predict", signature(object = "RandomForestClassificationModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save the Random Forest Regression or Classification model to the input path. + +#' @param object A fitted Random Forest regression model or classification model. +#' @param path The directory where the model is saved. +#' @param overwrite Whether to overwrite the output path if it already exists. Default is FALSE, +#' which means an exception is thrown if the output path exists. +#' +#' @aliases write.ml,RandomForestRegressionModel,character-method +#' @rdname spark.randomForest +#' @export +#' @note write.ml(RandomForestRegressionModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "RandomForestRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' @aliases write.ml,RandomForestClassificationModel,character-method +#' @rdname spark.randomForest +#' @export +#' @note write.ml(RandomForestClassificationModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "RandomForestClassificationModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R new file mode 100644 index 0000000000000..720ee41c58065 --- /dev/null +++ b/R/pkg/R/mllib_utils.R @@ -0,0 +1,119 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# mllib_utils.R: Utilities for MLlib integration + +# Integration with R's standard functions. +# Most of MLlib's algorithms are provided in two flavours: +# - a specialization of the default R methods (glm). These methods try to respect +# the inputs and the outputs of R's method to the largest extent, but some small differences +# may exist. +# - a set of methods that reflect the arguments of the other languages supported by Spark. These +# methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc. + +#' Saves the MLlib model to the input path +#' +#' Saves the MLlib model to the input path. For more information, see the specific +#' MLlib model below. +#' @rdname write.ml +#' @name write.ml +#' @export +#' @seealso \link{spark.glm}, \link{glm}, +#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, +#' @seealso \link{spark.kmeans}, +#' @seealso \link{spark.lda}, \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, +#' @seealso \link{spark.randomForest}, \link{spark.survreg}, +#' @seealso \link{read.ml} +NULL + +#' Makes predictions from an MLlib model +#' +#' Makes predictions from an MLlib model. For more information, see the specific +#' MLlib model below. +#' @rdname predict +#' @name predict +#' @export +#' @seealso \link{spark.glm}, \link{glm}, +#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, +#' @seealso \link{spark.kmeans}, +#' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, +#' @seealso \link{spark.randomForest}, \link{spark.survreg} +NULL + +write_internal <- function(object, path, overwrite = FALSE) { + writer <- callJMethod(object@jobj, "write") + if (overwrite) { + writer <- callJMethod(writer, "overwrite") + } + invisible(callJMethod(writer, "save", path)) +} + +predict_internal <- function(object, newData) { + dataFrame(callJMethod(object@jobj, "transform", newData@sdf)) +} + +#' Load a fitted MLlib model from the input path. +#' +#' @param path path of the model to read. +#' @return A fitted MLlib model. 
+#' @rdname read.ml +#' @name read.ml +#' @export +#' @seealso \link{write.ml} +#' @examples +#' \dontrun{ +#' path <- "path/to/model" +#' model <- read.ml(path) +#' } +#' @note read.ml since 2.0.0 +read.ml <- function(path) { + path <- suppressWarnings(normalizePath(path)) + sparkSession <- getSparkSession() + callJStatic("org.apache.spark.ml.r.RWrappers", "session", sparkSession) + jobj <- callJStatic("org.apache.spark.ml.r.RWrappers", "load", path) + if (isInstanceOf(jobj, "org.apache.spark.ml.r.NaiveBayesWrapper")) { + new("NaiveBayesModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper")) { + new("AFTSurvivalRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper")) { + new("GeneralizedLinearRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.KMeansWrapper")) { + new("KMeansModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LDAWrapper")) { + new("LDAModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper")) { + new("MultilayerPerceptronClassificationModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.IsotonicRegressionWrapper")) { + new("IsotonicRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GaussianMixtureWrapper")) { + new("GaussianMixtureModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) { + new("ALSModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LogisticRegressionWrapper")) { + new("LogisticRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestRegressorWrapper")) { + new("RandomForestRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestClassifierWrapper")) { + new("RandomForestClassificationModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTRegressorWrapper")) { + new("GBTRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTClassifierWrapper")) { + new("GBTClassificationModel", jobj = jobj) + } else { + stop("Unsupported model: ", jobj) + } +} diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R deleted file mode 100644 index 0f0d831c6f7c0..0000000000000 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ /dev/null @@ -1,1170 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
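To see how these helpers fit together, here is a minimal round-trip sketch, assuming an active SparkR session and using spark.glm purely as a representative model: write.ml and predict dispatch to the write_internal and predict_internal helpers above, while read.ml inspects the underlying Java wrapper class to decide which R model class to return.

df <- suppressWarnings(createDataFrame(iris))
model <- spark.glm(df, Sepal_Width ~ Sepal_Length + Species, family = "gaussian")
modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")   # placeholder path
write.ml(model, modelPath)            # goes through write_internal(); overwrite = FALSE by default
restored <- read.ml(modelPath)        # dispatched here to a GeneralizedLinearRegressionModel
head(collect(predict(restored, df)))  # predict() goes through predict_internal()
unlink(modelPath)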
-# - -library(testthat) - -context("MLlib functions") - -# Tests for MLlib functions in SparkR -sparkSession <- sparkR.session(enableHiveSupport = FALSE) - -absoluteSparkPath <- function(x) { - sparkHome <- sparkR.conf("spark.home") - file.path(sparkHome, x) -} - -test_that("formula of spark.glm", { - training <- suppressWarnings(createDataFrame(iris)) - # directly calling the spark API - # dot minus and intercept vs native glm - model <- spark.glm(training, Sepal_Width ~ . - Species + 0) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # feature interaction vs native glm - model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # glm should work with long formula - training <- suppressWarnings(createDataFrame(iris)) - training$LongLongLongLongLongName <- training$Sepal_Width - training$VeryLongLongLongLonLongName <- training$Sepal_Length - training$AnotherLongLongLongLongName <- training$Species - model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName + - AnotherLongLongLongLongName) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) -}) - -test_that("spark.glm and predict", { - training <- suppressWarnings(createDataFrame(iris)) - # gaussian family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # poisson family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = poisson(link = identity)) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species, - data = iris, family = poisson(link = identity)), iris)) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # Test stats::predict is working - x <- rnorm(15) - y <- x + rnorm(15) - expect_equal(length(predict(lm(y ~ x))), 15) -}) - -test_that("spark.glm summary", { - # gaussian family - training <- suppressWarnings(createDataFrame(iris)) - stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species)) - - rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) - - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) - expect_true(all(abs(rCoefs - coefs) < 1e-4)) - expect_true(all( - rownames(stats$coefficients) == - c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) - expect_equal(stats$dispersion, rStats$dispersion) - expect_equal(stats$null.deviance, rStats$null.deviance) - expect_equal(stats$deviance, rStats$deviance) - expect_equal(stats$df.null, rStats$df.null) - expect_equal(stats$df.residual, rStats$df.residual) - expect_equal(stats$aic, 
rStats$aic) - - out <- capture.output(print(stats)) - expect_match(out[2], "Deviance Residuals:") - expect_true(any(grepl("AIC: 59.22", out))) - - # binomial family - df <- suppressWarnings(createDataFrame(iris)) - training <- df[df$Species %in% c("versicolor", "virginica"), ] - stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width, - family = binomial(link = "logit"))) - - rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ] - rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, - family = binomial(link = "logit"))) - - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) - expect_true(all(abs(rCoefs - coefs) < 1e-4)) - expect_true(all( - rownames(stats$coefficients) == - c("(Intercept)", "Sepal_Length", "Sepal_Width"))) - expect_equal(stats$dispersion, rStats$dispersion) - expect_equal(stats$null.deviance, rStats$null.deviance) - expect_equal(stats$deviance, rStats$deviance) - expect_equal(stats$df.null, rStats$df.null) - expect_equal(stats$df.residual, rStats$df.residual) - expect_equal(stats$aic, rStats$aic) - - # Test spark.glm works with weighted dataset - a1 <- c(0, 1, 2, 3) - a2 <- c(5, 2, 1, 3) - w <- c(1, 2, 3, 4) - b <- c(1, 0, 1, 0) - data <- as.data.frame(cbind(a1, a2, w, b)) - df <- createDataFrame(data) - - stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w")) - rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w)) - - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) - expect_true(all(abs(rCoefs - coefs) < 1e-3)) - expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2"))) - expect_equal(stats$dispersion, rStats$dispersion) - expect_equal(stats$null.deviance, rStats$null.deviance) - expect_equal(stats$deviance, rStats$deviance) - expect_equal(stats$df.null, rStats$df.null) - expect_equal(stats$df.residual, rStats$df.residual) - expect_equal(stats$aic, rStats$aic) - - # Test summary works on base GLM models - baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris) - baseSummary <- summary(baseModel) - expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4) - - # Test spark.glm works with regularization parameter - data <- as.data.frame(cbind(a1, a2, b)) - df <- suppressWarnings(createDataFrame(data)) - regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0)) - expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result - - # Test spark.glm works on collinear data - A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2) - b <- c(1, 2, 3, 4) - data <- as.data.frame(cbind(A, b)) - df <- createDataFrame(data) - stats <- summary(spark.glm(df, b ~ . 
- 1)) - coefs <- unlist(stats$coefficients) - expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4)) -}) - -test_that("spark.glm save/load", { - training <- suppressWarnings(createDataFrame(iris)) - m <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) - s <- summary(m) - - modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp") - write.ml(m, modelPath) - expect_error(write.ml(m, modelPath)) - write.ml(m, modelPath, overwrite = TRUE) - m2 <- read.ml(modelPath) - s2 <- summary(m2) - - expect_equal(s$coefficients, s2$coefficients) - expect_equal(rownames(s$coefficients), rownames(s2$coefficients)) - expect_equal(s$dispersion, s2$dispersion) - expect_equal(s$null.deviance, s2$null.deviance) - expect_equal(s$deviance, s2$deviance) - expect_equal(s$df.null, s2$df.null) - expect_equal(s$df.residual, s2$df.residual) - expect_equal(s$aic, s2$aic) - expect_equal(s$iter, s2$iter) - expect_true(!s$is.loaded) - expect_true(s2$is.loaded) - - unlink(modelPath) -}) - - - -test_that("formula of glm", { - training <- suppressWarnings(createDataFrame(iris)) - # dot minus and intercept vs native glm - model <- glm(Sepal_Width ~ . - Species + 0, data = training) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # feature interaction vs native glm - model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # glm should work with long formula - training <- suppressWarnings(createDataFrame(iris)) - training$LongLongLongLongLongName <- training$Sepal_Width - training$VeryLongLongLongLonLongName <- training$Sepal_Length - training$AnotherLongLongLongLongName <- training$Species - model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName, - data = training) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) -}) - -test_that("glm and predict", { - training <- suppressWarnings(createDataFrame(iris)) - # gaussian family - model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # poisson family - model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training, - family = poisson(link = identity)) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species, - data = iris, family = poisson(link = identity)), iris)) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # Test stats::predict is working - x <- rnorm(15) - y <- x + rnorm(15) - expect_equal(length(predict(lm(y ~ x))), 15) -}) - -test_that("glm summary", { - # gaussian family - training <- suppressWarnings(createDataFrame(iris)) - stats <- 
summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training)) - - rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) - - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) - expect_true(all(abs(rCoefs - coefs) < 1e-4)) - expect_true(all( - rownames(stats$coefficients) == - c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) - expect_equal(stats$dispersion, rStats$dispersion) - expect_equal(stats$null.deviance, rStats$null.deviance) - expect_equal(stats$deviance, rStats$deviance) - expect_equal(stats$df.null, rStats$df.null) - expect_equal(stats$df.residual, rStats$df.residual) - expect_equal(stats$aic, rStats$aic) - - # binomial family - df <- suppressWarnings(createDataFrame(iris)) - training <- df[df$Species %in% c("versicolor", "virginica"), ] - stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training, - family = binomial(link = "logit"))) - - rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ] - rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, - family = binomial(link = "logit"))) - - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) - expect_true(all(abs(rCoefs - coefs) < 1e-4)) - expect_true(all( - rownames(stats$coefficients) == - c("(Intercept)", "Sepal_Length", "Sepal_Width"))) - expect_equal(stats$dispersion, rStats$dispersion) - expect_equal(stats$null.deviance, rStats$null.deviance) - expect_equal(stats$deviance, rStats$deviance) - expect_equal(stats$df.null, rStats$df.null) - expect_equal(stats$df.residual, rStats$df.residual) - expect_equal(stats$aic, rStats$aic) - - # Test summary works on base GLM models - baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris) - baseSummary <- summary(baseModel) - expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4) -}) - -test_that("glm save/load", { - training <- suppressWarnings(createDataFrame(iris)) - m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training) - s <- summary(m) - - modelPath <- tempfile(pattern = "glm", fileext = ".tmp") - write.ml(m, modelPath) - expect_error(write.ml(m, modelPath)) - write.ml(m, modelPath, overwrite = TRUE) - m2 <- read.ml(modelPath) - s2 <- summary(m2) - - expect_equal(s$coefficients, s2$coefficients) - expect_equal(rownames(s$coefficients), rownames(s2$coefficients)) - expect_equal(s$dispersion, s2$dispersion) - expect_equal(s$null.deviance, s2$null.deviance) - expect_equal(s$deviance, s2$deviance) - expect_equal(s$df.null, s2$df.null) - expect_equal(s$df.residual, s2$df.residual) - expect_equal(s$aic, s2$aic) - expect_equal(s$iter, s2$iter) - expect_true(!s$is.loaded) - expect_true(s2$is.loaded) - - unlink(modelPath) -}) - -test_that("spark.kmeans", { - newIris <- iris - newIris$Species <- NULL - training <- suppressWarnings(createDataFrame(newIris)) - - take(training, 1) - - model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random") - sample <- take(select(predict(model, training), "prediction"), 1) - expect_equal(typeof(sample$prediction), "integer") - expect_equal(sample$prediction, 1) - - # Test stats::kmeans is working - statsModel <- kmeans(x = newIris, centers = 2) - expect_equal(sort(unique(statsModel$cluster)), c(1, 2)) - - # Test fitted works on KMeans - fitted.model <- fitted(model) - expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1)) - - # Test summary works on KMeans - summary.model <- summary(model) - cluster <- 
summary.model$cluster - k <- summary.model$k - expect_equal(k, 2) - expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1)) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - summary2 <- summary(model2) - expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size))) - expect_equal(summary.model$coefficients, summary2$coefficients) - expect_true(!summary.model$is.loaded) - expect_true(summary2$is.loaded) - - unlink(modelPath) -}) - -test_that("spark.mlp", { - df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), - source = "libsvm") - model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), - solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1) - - # Test summary method - summary <- summary(model) - expect_equal(summary$numOfInputs, 4) - expect_equal(summary$numOfOutputs, 3) - expect_equal(summary$layers, c(4, 5, 4, 3)) - expect_equal(length(summary$weights), 64) - expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825), - tolerance = 1e-6) - - # Test predict method - mlpTestDF <- df - mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0")) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - summary2 <- summary(model2) - - expect_equal(summary2$numOfInputs, 4) - expect_equal(summary2$numOfOutputs, 3) - expect_equal(summary2$layers, c(4, 5, 4, 3)) - expect_equal(length(summary2$weights), 64) - - unlink(modelPath) - - # Test default parameter - model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3)) - mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 10), - c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - - # Test illegal parameter - expect_error(spark.mlp(df, label ~ features, layers = NULL), - "layers must be a integer vector with length > 1.") - expect_error(spark.mlp(df, label ~ features, layers = c()), - "layers must be a integer vector with length > 1.") - expect_error(spark.mlp(df, label ~ features, layers = c(3)), - "layers must be a integer vector with length > 1.") - - # Test random seed - # default seed - model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10) - mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 10), - c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - # seed equals 10 - model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10) - mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 10), - c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - - # test initialWeights - model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights = - c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) - mlpPredictions <- collect(select(predict(model, mlpTestDF), 
"prediction")) - expect_equal(head(mlpPredictions$prediction, 10), - c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - - model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights = - c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0)) - mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 10), - c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - - model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2) - mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 10), - c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "2.0", "1.0", "0.0")) - - # Test formula works well - df <- suppressWarnings(createDataFrame(iris)) - model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, - layers = c(4, 3)) - summary <- summary(model) - expect_equal(summary$numOfInputs, 4) - expect_equal(summary$numOfOutputs, 3) - expect_equal(summary$layers, c(4, 3)) - expect_equal(length(summary$weights), 15) - expect_equal(head(summary$weights, 5), list(-1.1957257, -5.2693685, 7.4489734, -6.3751413, - -10.2376130), tolerance = 1e-6) -}) - -test_that("spark.naiveBayes", { - # R code to reproduce the result. - # We do not support instance weights yet. So we ignore the frequencies. - # - #' library(e1071) - #' t <- as.data.frame(Titanic) - #' t1 <- t[t$Freq > 0, -5] - #' m <- naiveBayes(Survived ~ ., data = t1) - #' m - #' predict(m, t1) - # - # -- output of 'm' - # - # A-priori probabilities: - # Y - # No Yes - # 0.4166667 0.5833333 - # - # Conditional probabilities: - # Class - # Y 1st 2nd 3rd Crew - # No 0.2000000 0.2000000 0.4000000 0.2000000 - # Yes 0.2857143 0.2857143 0.2857143 0.1428571 - # - # Sex - # Y Male Female - # No 0.5 0.5 - # Yes 0.5 0.5 - # - # Age - # Y Child Adult - # No 0.2000000 0.8000000 - # Yes 0.4285714 0.5714286 - # - # -- output of 'predict(m, t1)' - # - # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No - # - - t <- as.data.frame(Titanic) - t1 <- t[t$Freq > 0, -5] - df <- suppressWarnings(createDataFrame(t1)) - m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0) - s <- summary(m) - expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6) - expect_equal(sum(s$apriori), 1) - expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6) - p <- collect(select(predict(m, df), "prediction")) - expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No", - "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No", - "Yes", "Yes", "No", "No")) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp") - write.ml(m, modelPath) - expect_error(write.ml(m, modelPath)) - write.ml(m, modelPath, overwrite = TRUE) - m2 <- read.ml(modelPath) - s2 <- summary(m2) - expect_equal(s$apriori, s2$apriori) - expect_equal(s$tables, s2$tables) - - unlink(modelPath) - - # Test e1071::naiveBayes - if (requireNamespace("e1071", quietly = TRUE)) { - expect_error(m <- e1071::naiveBayes(Survived ~ ., data = t1), NA) - expect_equal(as.character(predict(m, t1[1, ])), "Yes") - } - - # Test numeric response variable - t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1) - t2 <- t1[-4] - df <- suppressWarnings(createDataFrame(t2)) - m <- spark.naiveBayes(df, NumericSurvived ~ ., 
smoothing = 0.0) - s <- summary(m) - expect_equal(as.double(s$apriori[1, 1]), 0.5833333, tolerance = 1e-6) - expect_equal(sum(s$apriori), 1) - expect_equal(as.double(s$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6) -}) - -test_that("spark.survreg", { - # R code to reproduce the result. - # - #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0), - #' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1)) - #' library(survival) - #' model <- survreg(Surv(time, status) ~ x + sex, rData) - #' summary(model) - #' predict(model, data) - # - # -- output of 'summary(model)' - # - # Value Std. Error z p - # (Intercept) 1.315 0.270 4.88 1.07e-06 - # x -0.190 0.173 -1.10 2.72e-01 - # sex -0.253 0.329 -0.77 4.42e-01 - # Log(scale) -1.160 0.396 -2.93 3.41e-03 - # - # -- output of 'predict(model, data)' - # - # 1 2 3 4 5 6 7 - # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269 - # - data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0), - list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1)) - df <- createDataFrame(data, c("time", "status", "x", "sex")) - model <- spark.survreg(df, Surv(time, status) ~ x + sex) - stats <- summary(model) - coefs <- as.vector(stats$coefficients[, 1]) - rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800) - expect_equal(coefs, rCoefs, tolerance = 1e-4) - expect_true(all( - rownames(stats$coefficients) == - c("(Intercept)", "x", "sex", "Log(scale)"))) - p <- collect(select(predict(model, df), "prediction")) - expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035, - 2.390146, 2.891269, 2.891269), tolerance = 1e-4) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - coefs2 <- as.vector(stats2$coefficients[, 1]) - expect_equal(coefs, coefs2) - expect_equal(rownames(stats$coefficients), rownames(stats2$coefficients)) - - unlink(modelPath) - - # Test survival::survreg - if (requireNamespace("survival", quietly = TRUE)) { - rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0), - x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1)) - expect_error( - model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData), - NA) - expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4) - } -}) - -test_that("spark.isotonicRegression", { - label <- c(7.0, 5.0, 3.0, 5.0, 1.0) - feature <- c(0.0, 1.0, 2.0, 3.0, 4.0) - weight <- c(1.0, 1.0, 1.0, 1.0, 1.0) - data <- as.data.frame(cbind(label, feature, weight)) - df <- createDataFrame(data) - - model <- spark.isoreg(df, label ~ feature, isotonic = FALSE, - weightCol = "weight") - # only allow one variable on the right hand side of the formula - expect_error(model2 <- spark.isoreg(df, ~., isotonic = FALSE)) - result <- summary(model) - expect_equal(result$predictions, list(7, 5, 4, 4, 1)) - - # Test model prediction - predict_data <- list(list(-2.0), list(-1.0), list(0.5), - list(0.75), list(1.0), list(2.0), list(9.0)) - predict_df <- createDataFrame(predict_data, c("feature")) - predict_result <- collect(select(predict(model, predict_df), "prediction")) - expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0)) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-isotonicRegression", fileext = ".tmp") - write.ml(model, 
modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - expect_equal(result, summary(model2)) - - unlink(modelPath) -}) - -test_that("spark.logit", { - # R code to reproduce the result. - # nolint start - #' library(glmnet) - #' iris.x = as.matrix(iris[, 1:4]) - #' iris.y = as.factor(as.character(iris[, 5])) - #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) - #' coef(logit) - # - # $setosa - # 5 x 1 sparse Matrix of class "dgCMatrix" - # s0 - # 1.0981324 - # Sepal.Length -0.2909860 - # Sepal.Width 0.5510907 - # Petal.Length -0.1915217 - # Petal.Width -0.4211946 - # - # $versicolor - # 5 x 1 sparse Matrix of class "dgCMatrix" - # s0 - # 1.520061e+00 - # Sepal.Length 2.524501e-02 - # Sepal.Width -5.310313e-01 - # Petal.Length 3.656543e-02 - # Petal.Width -3.144464e-05 - # - # $virginica - # 5 x 1 sparse Matrix of class "dgCMatrix" - # s0 - # -2.61819385 - # Sepal.Length 0.26574097 - # Sepal.Width -0.02005932 - # Petal.Length 0.15495629 - # Petal.Width 0.42122607 - # nolint end - - # Test multinomial logistic regression againt three classes - df <- suppressWarnings(createDataFrame(iris)) - model <- spark.logit(df, Species ~ ., regParam = 0.5) - summary <- summary(model) - versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00) - virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42) - setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42) - versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) - virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) - setosaCoefs <- unlist(summary$coefficients[, "setosa"]) - expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) - expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) - expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1)) - - # Test model save and load - modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - coefs <- summary(model)$coefficients - coefs2 <- summary(model2)$coefficients - expect_equal(coefs, coefs2) - unlink(modelPath) - - # R code to reproduce the result. 
- # nolint start - #' library(glmnet) - #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ] - #' iris.x = as.matrix(iris2[, 1:4]) - #' iris.y = as.factor(as.character(iris2[, 5])) - #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) - #' coef(logit) - # - # $versicolor - # 5 x 1 sparse Matrix of class "dgCMatrix" - # s0 - # 3.93844796 - # Sepal.Length -0.13538675 - # Sepal.Width -0.02386443 - # Petal.Length -0.35076451 - # Petal.Width -0.77971954 - # - # $virginica - # 5 x 1 sparse Matrix of class "dgCMatrix" - # s0 - # -3.93844796 - # Sepal.Length 0.13538675 - # Sepal.Width 0.02386443 - # Petal.Length 0.35076451 - # Petal.Width 0.77971954 - # - #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5) - #' coef(logit) - # - # 5 x 1 sparse Matrix of class "dgCMatrix" - # s0 - # (Intercept) -6.0824412 - # Sepal.Length 0.2458260 - # Sepal.Width 0.1642093 - # Petal.Length 0.4759487 - # Petal.Width 1.0383948 - # - # nolint end - - # Test multinomial logistic regression againt two classes - df <- suppressWarnings(createDataFrame(iris)) - training <- df[df$Species %in% c("versicolor", "virginica"), ] - model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial") - summary <- summary(model) - versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78) - virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78) - versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) - virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) - expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) - expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) - - # Test binomial logistic regression againt two classes - model <- spark.logit(training, Species ~ ., regParam = 0.5) - summary <- summary(model) - coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04) - coefs <- unlist(summary$coefficients[, "Estimate"]) - expect_true(all(abs(coefsR - coefs) < 0.1)) - - # Test prediction with string label - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") - expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor", - "versicolor", "versicolor", "versicolor", "versicolor", "versicolor") - expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected) - - # Test prediction with numeric label - label <- c(0.0, 0.0, 0.0, 1.0, 1.0) - feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) - data <- as.data.frame(cbind(label, feature)) - df <- createDataFrame(data) - model <- spark.logit(df, label ~ feature) - prediction <- collect(select(predict(model, df), "prediction")) - expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0")) -}) - -test_that("spark.gaussianMixture", { - # R code to reproduce the result. 
- # nolint start - #' library(mvtnorm) - #' set.seed(1) - #' a <- rmvnorm(7, c(0, 0)) - #' b <- rmvnorm(8, c(10, 10)) - #' data <- rbind(a, b) - #' model <- mvnormalmixEM(data, k = 2) - #' model$lambda - # - # [1] 0.4666667 0.5333333 - # - #' model$mu - # - # [1] 0.11731091 -0.06192351 - # [1] 10.363673 9.897081 - # - #' model$sigma - # - # [[1]] - # [,1] [,2] - # [1,] 0.62049934 0.06880802 - # [2,] 0.06880802 1.27431874 - # - # [[2]] - # [,1] [,2] - # [1,] 0.2961543 0.160783 - # [2,] 0.1607830 1.008878 - # nolint end - data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808), - list(0.3295078, -0.8204684), list(0.4874291, 0.7383247), - list(0.5757814, -0.3053884), list(1.5117812, 0.3898432), - list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664), - list(9.9838097, 10.9438362), list(10.8212212, 10.5939013), - list(10.9189774, 10.7821363), list(10.0745650, 8.0106483), - list(10.6198257, 9.9438713), list(9.8442045, 8.5292476), - list(9.5218499, 10.4179416)) - df <- createDataFrame(data, c("x1", "x2")) - model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2) - stats <- summary(model) - rLambda <- c(0.4666667, 0.5333333) - rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081) - rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874, - 0.2961543, 0.160783, 0.1607830, 1.008878) - expect_equal(stats$lambda, rLambda, tolerance = 1e-3) - expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3) - expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3) - p <- collect(select(predict(model, df), "prediction")) - expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - expect_equal(stats$lambda, stats2$lambda) - expect_equal(unlist(stats$mu), unlist(stats2$mu)) - expect_equal(unlist(stats$sigma), unlist(stats2$sigma)) - - unlink(modelPath) -}) - -test_that("spark.lda with libsvm", { - text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm") - model <- spark.lda(text, optimizer = "em") - - stats <- summary(model, 10) - isDistributed <- stats$isDistributed - logLikelihood <- stats$logLikelihood - logPerplexity <- stats$logPerplexity - vocabSize <- stats$vocabSize - topics <- stats$topicTopTerms - weights <- stats$topicTopTermsWeights - vocabulary <- stats$vocabulary - - expect_false(isDistributed) - expect_true(logLikelihood <= 0 & is.finite(logLikelihood)) - expect_true(logPerplexity >= 0 & is.finite(logPerplexity)) - expect_equal(vocabSize, 11) - expect_true(is.null(vocabulary)) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - - expect_false(stats2$isDistributed) - expect_equal(logLikelihood, stats2$logLikelihood) - expect_equal(logPerplexity, stats2$logPerplexity) - expect_equal(vocabSize, stats2$vocabSize) - expect_equal(vocabulary, stats2$vocabulary) - - unlink(modelPath) -}) - -test_that("spark.lda with text input", { - text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt")) - model <- spark.lda(text, optimizer = "online", features = "value") - - stats <- summary(model) - isDistributed <- stats$isDistributed - 
logLikelihood <- stats$logLikelihood - logPerplexity <- stats$logPerplexity - vocabSize <- stats$vocabSize - topics <- stats$topicTopTerms - weights <- stats$topicTopTermsWeights - vocabulary <- stats$vocabulary - - expect_false(isDistributed) - expect_true(logLikelihood <= 0 & is.finite(logLikelihood)) - expect_true(logPerplexity >= 0 & is.finite(logPerplexity)) - expect_equal(vocabSize, 10) - expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"))) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - - expect_false(stats2$isDistributed) - expect_equal(logLikelihood, stats2$logLikelihood) - expect_equal(logPerplexity, stats2$logPerplexity) - expect_equal(vocabSize, stats2$vocabSize) - expect_true(all.equal(vocabulary, stats2$vocabulary)) - - unlink(modelPath) -}) - -test_that("spark.posterior and spark.perplexity", { - text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt")) - model <- spark.lda(text, features = "value", k = 3) - - # Assert perplexities are equal - stats <- summary(model) - logPerplexity <- spark.perplexity(model, text) - expect_equal(logPerplexity, stats$logPerplexity) - - # Assert the sum of every topic distribution is equal to 1 - posterior <- spark.posterior(model, text) - local.posterior <- collect(posterior)$topicDistribution - expect_equal(length(local.posterior), sum(unlist(local.posterior))) -}) - -test_that("spark.als", { - data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), - list(2, 1, 1.0), list(2, 2, 5.0)) - df <- createDataFrame(data, c("user", "item", "score")) - model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", - rank = 10, maxIter = 5, seed = 0, regParam = 0.1) - stats <- summary(model) - expect_equal(stats$rank, 10) - test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item")) - predictions <- collect(predict(model, test)) - - expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409), - tolerance = 1e-4) - - # Test model save/load - modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - expect_equal(stats2$rating, "score") - userFactors <- collect(stats$userFactors) - itemFactors <- collect(stats$itemFactors) - userFactors2 <- collect(stats2$userFactors) - itemFactors2 <- collect(stats2$itemFactors) - - orderUser <- order(userFactors$id) - orderUser2 <- order(userFactors2$id) - expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2]) - expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2]) - - orderItem <- order(itemFactors$id) - orderItem2 <- order(itemFactors2$id) - expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2]) - expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2]) - - unlink(modelPath) -}) - -test_that("spark.kstest", { - data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5)) - df <- createDataFrame(data) - testResult <- spark.kstest(df, "test", "norm") - stats <- summary(testResult) - - rStats <- ks.test(data$test, "pnorm", alternative = "two.sided") - - expect_equal(stats$p.value, rStats$p.value, 
tolerance = 1e-4) - expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) - expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") - - testResult <- spark.kstest(df, "test", "norm", -0.5) - stats <- summary(testResult) - - rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided") - - expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4) - expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) - expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") - - # Test print.summary.KSTest - printStats <- capture.output(print.summary.KSTest(stats)) - expect_match(printStats[1], "Kolmogorov-Smirnov test summary:") - expect_match(printStats[5], - "Low presumption against null hypothesis: Sample follows theoretical distribution. ") -}) - -test_that("spark.randomForest", { - # regression - data <- suppressWarnings(createDataFrame(longley)) - model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, - numTrees = 1) - - predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, - 63.221, 63.639, 64.989, 63.761, - 66.019, 67.857, 68.169, 66.513, - 68.655, 69.564, 69.331, 70.551), - tolerance = 1e-4) - - stats <- summary(model) - expect_equal(stats$numTrees, 1) - expect_error(capture.output(stats), NA) - expect_true(length(capture.output(stats)) > 6) - - model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, - numTrees = 20, seed = 123) - predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070, - 63.53160, 64.05470, 65.12710, 64.30450, - 66.70910, 67.86125, 68.08700, 67.21865, - 68.89275, 69.53180, 69.39640, 69.68250), - - tolerance = 1e-4) - stats <- summary(model) - expect_equal(stats$numTrees, 20) - - modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - expect_equal(stats$formula, stats2$formula) - expect_equal(stats$numFeatures, stats2$numFeatures) - expect_equal(stats$features, stats2$features) - expect_equal(stats$featureImportances, stats2$featureImportances) - expect_equal(stats$numTrees, stats2$numTrees) - expect_equal(stats$treeWeights, stats2$treeWeights) - - unlink(modelPath) - - # classification - data <- suppressWarnings(createDataFrame(iris)) - model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification", - maxDepth = 5, maxBins = 16) - - stats <- summary(model) - expect_equal(stats$numFeatures, 2) - expect_equal(stats$numTrees, 20) - expect_error(capture.output(stats), NA) - expect_true(length(capture.output(stats)) > 6) - # Test string prediction values - predictions <- collect(predict(model, data))$prediction - expect_equal(length(grep("setosa", predictions)), 50) - expect_equal(length(grep("versicolor", predictions)), 50) - - modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - expect_equal(stats$depth, stats2$depth) - expect_equal(stats$numNodes, stats2$numNodes) - expect_equal(stats$numClasses, stats2$numClasses) - - unlink(modelPath) - - # Test numeric 
response variable - labelToIndex <- function(species) { - switch(as.character(species), - setosa = 0.0, - versicolor = 1.0, - virginica = 2.0 - ) - } - iris$NumericSpecies <- lapply(iris$Species, labelToIndex) - data <- suppressWarnings(createDataFrame(iris[-5])) - model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification", - maxDepth = 5, maxBins = 16) - stats <- summary(model) - expect_equal(stats$numFeatures, 2) - expect_equal(stats$numTrees, 20) - # Test numeric prediction values - predictions <- collect(predict(model, data))$prediction - expect_equal(length(grep("1.0", predictions)), 50) - expect_equal(length(grep("2.0", predictions)), 50) - - # spark.randomForest classification can work on libsvm data - data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), - source = "libsvm") - model <- spark.randomForest(data, label ~ features, "classification") - expect_equal(summary(model)$numFeatures, 4) -}) - -test_that("spark.gbt", { - # regression - data <- suppressWarnings(createDataFrame(longley)) - model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123) - predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, - 63.221, 63.639, 64.989, 63.761, - 66.019, 67.857, 68.169, 66.513, - 68.655, 69.564, 69.331, 70.551), - tolerance = 1e-4) - stats <- summary(model) - expect_equal(stats$numTrees, 20) - expect_equal(stats$formula, "Employed ~ .") - expect_equal(stats$numFeatures, 6) - expect_equal(length(stats$treeWeights), 20) - - modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - expect_equal(stats$formula, stats2$formula) - expect_equal(stats$numFeatures, stats2$numFeatures) - expect_equal(stats$features, stats2$features) - expect_equal(stats$featureImportances, stats2$featureImportances) - expect_equal(stats$numTrees, stats2$numTrees) - expect_equal(stats$treeWeights, stats2$treeWeights) - - unlink(modelPath) - - # classification - # label must be binary - GBTClassifier currently only supports binary classification. 
- iris2 <- iris[iris$Species != "virginica", ] - data <- suppressWarnings(createDataFrame(iris2)) - model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification") - stats <- summary(model) - expect_equal(stats$numFeatures, 2) - expect_equal(stats$numTrees, 20) - expect_error(capture.output(stats), NA) - expect_true(length(capture.output(stats)) > 6) - predictions <- collect(predict(model, data))$prediction - # test string prediction values - expect_equal(length(grep("setosa", predictions)), 50) - expect_equal(length(grep("versicolor", predictions)), 50) - - modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - stats2 <- summary(model2) - expect_equal(stats$depth, stats2$depth) - expect_equal(stats$numNodes, stats2$numNodes) - expect_equal(stats$numClasses, stats2$numClasses) - - unlink(modelPath) - - iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) - df <- suppressWarnings(createDataFrame(iris2)) - m <- spark.gbt(df, NumericSpecies ~ ., type = "classification") - s <- summary(m) - # test numeric prediction values - expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction)) - expect_equal(s$numFeatures, 5) - expect_equal(s$numTrees, 20) - - # spark.gbt classification can work on libsvm data - data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"), - source = "libsvm") - model <- spark.gbt(data, label ~ features, "classification") - expect_equal(summary(model)$numFeatures, 692) -}) - -sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R new file mode 100644 index 0000000000000..2e0dea321e7bf --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -0,0 +1,341 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib classification algorithms, except for tree-based algorithms") + +# Tests for MLlib classification algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +absoluteSparkPath <- function(x) { + sparkHome <- sparkR.conf("spark.home") + file.path(sparkHome, x) +} + +test_that("spark.logit", { + # R code to reproduce the result. 
+ # nolint start + #' library(glmnet) + #' iris.x = as.matrix(iris[, 1:4]) + #' iris.y = as.factor(as.character(iris[, 5])) + #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # $setosa + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 1.0981324 + # Sepal.Length -0.2909860 + # Sepal.Width 0.5510907 + # Petal.Length -0.1915217 + # Petal.Width -0.4211946 + # + # $versicolor + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 1.520061e+00 + # Sepal.Length 2.524501e-02 + # Sepal.Width -5.310313e-01 + # Petal.Length 3.656543e-02 + # Petal.Width -3.144464e-05 + # + # $virginica + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # -2.61819385 + # Sepal.Length 0.26574097 + # Sepal.Width -0.02005932 + # Petal.Length 0.15495629 + # Petal.Width 0.42122607 + # nolint end + + # Test multinomial logistic regression against three classes + df <- suppressWarnings(createDataFrame(iris)) + model <- spark.logit(df, Species ~ ., regParam = 0.5) + summary <- summary(model) + versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00) + virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42) + setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42) + versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) + virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) + setosaCoefs <- unlist(summary$coefficients[, "setosa"]) + expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) + expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) + expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1)) + + # Test model save and load + modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + coefs <- summary(model)$coefficients + coefs2 <- summary(model2)$coefficients + expect_equal(coefs, coefs2) + unlink(modelPath) + + # R code to reproduce the result.
+ # nolint start + #' library(glmnet) + #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ] + #' iris.x = as.matrix(iris2[, 1:4]) + #' iris.y = as.factor(as.character(iris2[, 5])) + #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # $versicolor + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 3.93844796 + # Sepal.Length -0.13538675 + # Sepal.Width -0.02386443 + # Petal.Length -0.35076451 + # Petal.Width -0.77971954 + # + # $virginica + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # -3.93844796 + # Sepal.Length 0.13538675 + # Sepal.Width 0.02386443 + # Petal.Length 0.35076451 + # Petal.Width 0.77971954 + # + #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # (Intercept) -6.0824412 + # Sepal.Length 0.2458260 + # Sepal.Width 0.1642093 + # Petal.Length 0.4759487 + # Petal.Width 1.0383948 + # + # nolint end + + # Test multinomial logistic regression against two classes + df <- suppressWarnings(createDataFrame(iris)) + training <- df[df$Species %in% c("versicolor", "virginica"), ] + model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial") + summary <- summary(model) + versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78) + virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78) + versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) + virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) + expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) + expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) + + # Test binomial logistic regression against two classes + model <- spark.logit(training, Species ~ ., regParam = 0.5) + summary <- summary(model) + coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04) + coefs <- unlist(summary$coefficients[, "Estimate"]) + expect_true(all(abs(coefsR - coefs) < 0.1)) + + # Test prediction with string label + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") + expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor", + "versicolor", "versicolor", "versicolor", "versicolor", "versicolor") + expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected) + + # Test prediction with numeric label + label <- c(0.0, 0.0, 0.0, 1.0, 1.0) + feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) + data <- as.data.frame(cbind(label, feature)) + df <- createDataFrame(data) + model <- spark.logit(df, label ~ feature) + prediction <- collect(select(predict(model, df), "prediction")) + expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0")) +}) + +test_that("spark.mlp", { + df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), + source = "libsvm") + model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), + solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1) + + # Test summary method + summary <- summary(model) + expect_equal(summary$numOfInputs, 4) + expect_equal(summary$numOfOutputs, 3) + expect_equal(summary$layers, c(4, 5, 4, 3)) + expect_equal(length(summary$weights), 64) + expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825), + tolerance = 1e-6) + + # Test predict method + mlpTestDF <- df + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) +
expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0")) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + summary2 <- summary(model2) + + expect_equal(summary2$numOfInputs, 4) + expect_equal(summary2$numOfOutputs, 3) + expect_equal(summary2$layers, c(4, 5, 4, 3)) + expect_equal(length(summary2$weights), 64) + + unlink(modelPath) + + # Test default parameter + model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3)) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) + + # Test illegal parameter + expect_error(spark.mlp(df, label ~ features, layers = NULL), + "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, label ~ features, layers = c()), + "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, label ~ features, layers = c(3)), + "layers must be a integer vector with length > 1.") + + # Test random seed + # default seed + model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) + # seed equals 10 + model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) + + # test initialWeights + model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights = + c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0")) + + model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights = + c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0)) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0")) + + model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "2.0", "1.0", "0.0")) + + # Test formula works well + df <- suppressWarnings(createDataFrame(iris)) + model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, + layers = c(4, 3)) + summary <- summary(model) + expect_equal(summary$numOfInputs, 4) + expect_equal(summary$numOfOutputs, 3) + expect_equal(summary$layers, c(4, 3)) + expect_equal(length(summary$weights), 15) + expect_equal(head(summary$weights, 5), list(-1.1957257, -5.2693685, 7.4489734, -6.3751413, + -10.2376130), tolerance = 1e-6) +}) + +test_that("spark.naiveBayes", { + # R code to reproduce the result. + # We do not support instance weights yet. 
So we ignore the frequencies. + # + #' library(e1071) + #' t <- as.data.frame(Titanic) + #' t1 <- t[t$Freq > 0, -5] + #' m <- naiveBayes(Survived ~ ., data = t1) + #' m + #' predict(m, t1) + # + # -- output of 'm' + # + # A-priori probabilities: + # Y + # No Yes + # 0.4166667 0.5833333 + # + # Conditional probabilities: + # Class + # Y 1st 2nd 3rd Crew + # No 0.2000000 0.2000000 0.4000000 0.2000000 + # Yes 0.2857143 0.2857143 0.2857143 0.1428571 + # + # Sex + # Y Male Female + # No 0.5 0.5 + # Yes 0.5 0.5 + # + # Age + # Y Child Adult + # No 0.2000000 0.8000000 + # Yes 0.4285714 0.5714286 + # + # -- output of 'predict(m, t1)' + # + # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No + # + + t <- as.data.frame(Titanic) + t1 <- t[t$Freq > 0, -5] + df <- suppressWarnings(createDataFrame(t1)) + m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0) + s <- summary(m) + expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6) + expect_equal(sum(s$apriori), 1) + expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6) + p <- collect(select(predict(m, df), "prediction")) + expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No", + "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No", + "Yes", "Yes", "No", "No")) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp") + write.ml(m, modelPath) + expect_error(write.ml(m, modelPath)) + write.ml(m, modelPath, overwrite = TRUE) + m2 <- read.ml(modelPath) + s2 <- summary(m2) + expect_equal(s$apriori, s2$apriori) + expect_equal(s$tables, s2$tables) + + unlink(modelPath) + + # Test e1071::naiveBayes + if (requireNamespace("e1071", quietly = TRUE)) { + expect_error(m <- e1071::naiveBayes(Survived ~ ., data = t1), NA) + expect_equal(as.character(predict(m, t1[1, ])), "Yes") + } + + # Test numeric response variable + t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1) + t2 <- t1[-4] + df <- suppressWarnings(createDataFrame(t2)) + m <- spark.naiveBayes(df, NumericSurvived ~ ., smoothing = 0.0) + s <- summary(m) + expect_equal(as.double(s$apriori[1, 1]), 0.5833333, tolerance = 1e-6) + expect_equal(sum(s$apriori), 1) + expect_equal(as.double(s$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6) +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R new file mode 100644 index 0000000000000..1980fffd80cc6 --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R @@ -0,0 +1,224 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("MLlib clustering algorithms") + +# Tests for MLlib clustering algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +absoluteSparkPath <- function(x) { + sparkHome <- sparkR.conf("spark.home") + file.path(sparkHome, x) +} + +test_that("spark.gaussianMixture", { + # R code to reproduce the result. + # nolint start + #' library(mvtnorm) + #' set.seed(1) + #' a <- rmvnorm(7, c(0, 0)) + #' b <- rmvnorm(8, c(10, 10)) + #' data <- rbind(a, b) + #' model <- mvnormalmixEM(data, k = 2) + #' model$lambda + # + # [1] 0.4666667 0.5333333 + # + #' model$mu + # + # [1] 0.11731091 -0.06192351 + # [1] 10.363673 9.897081 + # + #' model$sigma + # + # [[1]] + # [,1] [,2] + # [1,] 0.62049934 0.06880802 + # [2,] 0.06880802 1.27431874 + # + # [[2]] + # [,1] [,2] + # [1,] 0.2961543 0.160783 + # [2,] 0.1607830 1.008878 + # nolint end + data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808), + list(0.3295078, -0.8204684), list(0.4874291, 0.7383247), + list(0.5757814, -0.3053884), list(1.5117812, 0.3898432), + list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664), + list(9.9838097, 10.9438362), list(10.8212212, 10.5939013), + list(10.9189774, 10.7821363), list(10.0745650, 8.0106483), + list(10.6198257, 9.9438713), list(9.8442045, 8.5292476), + list(9.5218499, 10.4179416)) + df <- createDataFrame(data, c("x1", "x2")) + model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2) + stats <- summary(model) + rLambda <- c(0.4666667, 0.5333333) + rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081) + rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874, + 0.2961543, 0.160783, 0.1607830, 1.008878) + expect_equal(stats$lambda, rLambda, tolerance = 1e-3) + expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3) + expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3) + p <- collect(select(predict(model, df), "prediction")) + expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$lambda, stats2$lambda) + expect_equal(unlist(stats$mu), unlist(stats2$mu)) + expect_equal(unlist(stats$sigma), unlist(stats2$sigma)) + + unlink(modelPath) +}) + +test_that("spark.kmeans", { + newIris <- iris + newIris$Species <- NULL + training <- suppressWarnings(createDataFrame(newIris)) + + take(training, 1) + + model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random") + sample <- take(select(predict(model, training), "prediction"), 1) + expect_equal(typeof(sample$prediction), "integer") + expect_equal(sample$prediction, 1) + + # Test stats::kmeans is working + statsModel <- kmeans(x = newIris, centers = 2) + expect_equal(sort(unique(statsModel$cluster)), c(1, 2)) + + # Test fitted works on KMeans + fitted.model <- fitted(model) + expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1)) + + # Test summary works on KMeans + summary.model <- summary(model) + cluster <- summary.model$cluster + k <- summary.model$k + expect_equal(k, 2) + expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1)) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, 
modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + summary2 <- summary(model2) + expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size))) + expect_equal(summary.model$coefficients, summary2$coefficients) + expect_true(!summary.model$is.loaded) + expect_true(summary2$is.loaded) + + unlink(modelPath) +}) + +test_that("spark.lda with libsvm", { + text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm") + model <- spark.lda(text, optimizer = "em") + + stats <- summary(model, 10) + isDistributed <- stats$isDistributed + logLikelihood <- stats$logLikelihood + logPerplexity <- stats$logPerplexity + vocabSize <- stats$vocabSize + topics <- stats$topicTopTerms + weights <- stats$topicTopTermsWeights + vocabulary <- stats$vocabulary + + expect_false(isDistributed) + expect_true(logLikelihood <= 0 & is.finite(logLikelihood)) + expect_true(logPerplexity >= 0 & is.finite(logPerplexity)) + expect_equal(vocabSize, 11) + expect_true(is.null(vocabulary)) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + + expect_false(stats2$isDistributed) + expect_equal(logLikelihood, stats2$logLikelihood) + expect_equal(logPerplexity, stats2$logPerplexity) + expect_equal(vocabSize, stats2$vocabSize) + expect_equal(vocabulary, stats2$vocabulary) + + unlink(modelPath) +}) + +test_that("spark.lda with text input", { + text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt")) + model <- spark.lda(text, optimizer = "online", features = "value") + + stats <- summary(model) + isDistributed <- stats$isDistributed + logLikelihood <- stats$logLikelihood + logPerplexity <- stats$logPerplexity + vocabSize <- stats$vocabSize + topics <- stats$topicTopTerms + weights <- stats$topicTopTermsWeights + vocabulary <- stats$vocabulary + + expect_false(isDistributed) + expect_true(logLikelihood <= 0 & is.finite(logLikelihood)) + expect_true(logPerplexity >= 0 & is.finite(logPerplexity)) + expect_equal(vocabSize, 10) + expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"))) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + + expect_false(stats2$isDistributed) + expect_equal(logLikelihood, stats2$logLikelihood) + expect_equal(logPerplexity, stats2$logPerplexity) + expect_equal(vocabSize, stats2$vocabSize) + expect_true(all.equal(vocabulary, stats2$vocabulary)) + + unlink(modelPath) +}) + +test_that("spark.posterior and spark.perplexity", { + text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt")) + model <- spark.lda(text, features = "value", k = 3) + + # Assert perplexities are equal + stats <- summary(model) + logPerplexity <- spark.perplexity(model, text) + expect_equal(logPerplexity, stats$logPerplexity) + + # Assert the sum of every topic distribution is equal to 1 + posterior <- spark.posterior(model, text) + local.posterior <- collect(posterior)$topicDistribution + expect_equal(length(local.posterior), sum(unlist(local.posterior))) +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib_recommendation.R 
b/R/pkg/inst/tests/testthat/test_mllib_recommendation.R new file mode 100644 index 0000000000000..6b1040db93050 --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_recommendation.R @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib recommendation algorithms") + +# Tests for MLlib recommendation algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +test_that("spark.als", { + data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), + list(2, 1, 1.0), list(2, 2, 5.0)) + df <- createDataFrame(data, c("user", "item", "score")) + model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", + rank = 10, maxIter = 5, seed = 0, regParam = 0.1) + stats <- summary(model) + expect_equal(stats$rank, 10) + test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item")) + predictions <- collect(predict(model, test)) + + expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409), + tolerance = 1e-4) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats2$rating, "score") + userFactors <- collect(stats$userFactors) + itemFactors <- collect(stats$itemFactors) + userFactors2 <- collect(stats2$userFactors) + itemFactors2 <- collect(stats2$itemFactors) + + orderUser <- order(userFactors$id) + orderUser2 <- order(userFactors2$id) + expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2]) + expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2]) + + orderItem <- order(itemFactors$id) + orderItem2 <- order(itemFactors2$id) + expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2]) + expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2]) + + unlink(modelPath) +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R new file mode 100644 index 0000000000000..e20dafa4147af --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -0,0 +1,417 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib regression algorithms, except for tree-based algorithms") + +# Tests for MLlib regression algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +test_that("formula of spark.glm", { + training <- suppressWarnings(createDataFrame(iris)) + # directly calling the spark API + # dot minus and intercept vs native glm + model <- spark.glm(training, Sepal_Width ~ . - Species + 0) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # feature interaction vs native glm + model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # glm should work with long formula + training <- suppressWarnings(createDataFrame(iris)) + training$LongLongLongLongLongName <- training$Sepal_Width + training$VeryLongLongLongLonLongName <- training$Sepal_Length + training$AnotherLongLongLongLongName <- training$Species + model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName + + AnotherLongLongLongLongName) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) +}) + +test_that("spark.glm and predict", { + training <- suppressWarnings(createDataFrame(iris)) + # gaussian family + model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # poisson family + model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, + family = poisson(link = identity)) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species, + data = iris, family = poisson(link = identity)), iris)) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # Test stats::predict is working + x <- rnorm(15) + y <- x + rnorm(15) + expect_equal(length(predict(lm(y ~ x))), 15) +}) + +test_that("spark.glm summary", { + # gaussian family + training <- suppressWarnings(createDataFrame(iris)) + stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species)) + + rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-4)) + expect_true(all( + rownames(stats$coefficients) == + 
c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) + expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + out <- capture.output(print(stats)) + expect_match(out[2], "Deviance Residuals:") + expect_true(any(grepl("AIC: 59.22", out))) + + # binomial family + df <- suppressWarnings(createDataFrame(iris)) + training <- df[df$Species %in% c("versicolor", "virginica"), ] + stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width, + family = binomial(link = "logit"))) + + rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ] + rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, + family = binomial(link = "logit"))) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-4)) + expect_true(all( + rownames(stats$coefficients) == + c("(Intercept)", "Sepal_Length", "Sepal_Width"))) + expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + # Test spark.glm works with weighted dataset + a1 <- c(0, 1, 2, 3) + a2 <- c(5, 2, 1, 3) + w <- c(1, 2, 3, 4) + b <- c(1, 0, 1, 0) + data <- as.data.frame(cbind(a1, a2, w, b)) + df <- createDataFrame(data) + + stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w")) + rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w)) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-3)) + expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2"))) + expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + # Test summary works on base GLM models + baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris) + baseSummary <- summary(baseModel) + expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4) + + # Test spark.glm works with regularization parameter + data <- as.data.frame(cbind(a1, a2, b)) + df <- suppressWarnings(createDataFrame(data)) + regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0)) + expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result + + # Test spark.glm works on collinear data + A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2) + b <- c(1, 2, 3, 4) + data <- as.data.frame(cbind(A, b)) + df <- createDataFrame(data) + stats <- summary(spark.glm(df, b ~ . 
- 1)) + coefs <- unlist(stats$coefficients) + expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4)) +}) + +test_that("spark.glm save/load", { + training <- suppressWarnings(createDataFrame(iris)) + m <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) + s <- summary(m) + + modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp") + write.ml(m, modelPath) + expect_error(write.ml(m, modelPath)) + write.ml(m, modelPath, overwrite = TRUE) + m2 <- read.ml(modelPath) + s2 <- summary(m2) + + expect_equal(s$coefficients, s2$coefficients) + expect_equal(rownames(s$coefficients), rownames(s2$coefficients)) + expect_equal(s$dispersion, s2$dispersion) + expect_equal(s$null.deviance, s2$null.deviance) + expect_equal(s$deviance, s2$deviance) + expect_equal(s$df.null, s2$df.null) + expect_equal(s$df.residual, s2$df.residual) + expect_equal(s$aic, s2$aic) + expect_equal(s$iter, s2$iter) + expect_true(!s$is.loaded) + expect_true(s2$is.loaded) + + unlink(modelPath) +}) + +test_that("formula of glm", { + training <- suppressWarnings(createDataFrame(iris)) + # dot minus and intercept vs native glm + model <- glm(Sepal_Width ~ . - Species + 0, data = training) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # feature interaction vs native glm + model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # glm should work with long formula + training <- suppressWarnings(createDataFrame(iris)) + training$LongLongLongLongLongName <- training$Sepal_Width + training$VeryLongLongLongLonLongName <- training$Sepal_Length + training$AnotherLongLongLongLongName <- training$Species + model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName, + data = training) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) +}) + +test_that("glm and predict", { + training <- suppressWarnings(createDataFrame(iris)) + # gaussian family + model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # poisson family + model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training, + family = poisson(link = identity)) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species, + data = iris, family = poisson(link = identity)), iris)) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # Test stats::predict is working + x <- rnorm(15) + y <- x + rnorm(15) + expect_equal(length(predict(lm(y ~ x))), 15) +}) + +test_that("glm summary", { + # gaussian family + training <- suppressWarnings(createDataFrame(iris)) + stats <- summary(glm(Sepal_Width ~ 
Sepal_Length + Species, data = training)) + + rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-4)) + expect_true(all( + rownames(stats$coefficients) == + c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) + expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + # binomial family + df <- suppressWarnings(createDataFrame(iris)) + training <- df[df$Species %in% c("versicolor", "virginica"), ] + stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training, + family = binomial(link = "logit"))) + + rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ] + rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, + family = binomial(link = "logit"))) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-4)) + expect_true(all( + rownames(stats$coefficients) == + c("(Intercept)", "Sepal_Length", "Sepal_Width"))) + expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + # Test summary works on base GLM models + baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris) + baseSummary <- summary(baseModel) + expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4) +}) + +test_that("glm save/load", { + training <- suppressWarnings(createDataFrame(iris)) + m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training) + s <- summary(m) + + modelPath <- tempfile(pattern = "glm", fileext = ".tmp") + write.ml(m, modelPath) + expect_error(write.ml(m, modelPath)) + write.ml(m, modelPath, overwrite = TRUE) + m2 <- read.ml(modelPath) + s2 <- summary(m2) + + expect_equal(s$coefficients, s2$coefficients) + expect_equal(rownames(s$coefficients), rownames(s2$coefficients)) + expect_equal(s$dispersion, s2$dispersion) + expect_equal(s$null.deviance, s2$null.deviance) + expect_equal(s$deviance, s2$deviance) + expect_equal(s$df.null, s2$df.null) + expect_equal(s$df.residual, s2$df.residual) + expect_equal(s$aic, s2$aic) + expect_equal(s$iter, s2$iter) + expect_true(!s$is.loaded) + expect_true(s2$is.loaded) + + unlink(modelPath) +}) + +test_that("spark.isoreg", { + label <- c(7.0, 5.0, 3.0, 5.0, 1.0) + feature <- c(0.0, 1.0, 2.0, 3.0, 4.0) + weight <- c(1.0, 1.0, 1.0, 1.0, 1.0) + data <- as.data.frame(cbind(label, feature, weight)) + df <- createDataFrame(data) + + model <- spark.isoreg(df, label ~ feature, isotonic = FALSE, + weightCol = "weight") + # only allow one variable on the right hand side of the formula + expect_error(model2 <- spark.isoreg(df, ~., isotonic = FALSE)) + result <- summary(model) + expect_equal(result$predictions, list(7, 5, 4, 4, 1)) + + # Test model prediction + predict_data <- list(list(-2.0), list(-1.0), list(0.5), + list(0.75), list(1.0), list(2.0), list(9.0)) + predict_df <- createDataFrame(predict_data, c("feature")) + predict_result <- collect(select(predict(model, predict_df), "prediction")) + 
expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0)) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-isoreg", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + expect_equal(result, summary(model2)) + + unlink(modelPath) +}) + +test_that("spark.survreg", { + # R code to reproduce the result. + # + #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0), + #' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1)) + #' library(survival) + #' model <- survreg(Surv(time, status) ~ x + sex, rData) + #' summary(model) + #' predict(model, data) + # + # -- output of 'summary(model)' + # + # Value Std. Error z p + # (Intercept) 1.315 0.270 4.88 1.07e-06 + # x -0.190 0.173 -1.10 2.72e-01 + # sex -0.253 0.329 -0.77 4.42e-01 + # Log(scale) -1.160 0.396 -2.93 3.41e-03 + # + # -- output of 'predict(model, data)' + # + # 1 2 3 4 5 6 7 + # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269 + # + data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0), + list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1)) + df <- createDataFrame(data, c("time", "status", "x", "sex")) + model <- spark.survreg(df, Surv(time, status) ~ x + sex) + stats <- summary(model) + coefs <- as.vector(stats$coefficients[, 1]) + rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800) + expect_equal(coefs, rCoefs, tolerance = 1e-4) + expect_true(all( + rownames(stats$coefficients) == + c("(Intercept)", "x", "sex", "Log(scale)"))) + p <- collect(select(predict(model, df), "prediction")) + expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035, + 2.390146, 2.891269, 2.891269), tolerance = 1e-4) + + # Test model save/load + modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + coefs2 <- as.vector(stats2$coefficients[, 1]) + expect_equal(coefs, coefs2) + expect_equal(rownames(stats$coefficients), rownames(stats2$coefficients)) + + unlink(modelPath) + + # Test survival::survreg + if (requireNamespace("survival", quietly = TRUE)) { + rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0), + x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1)) + expect_error( + model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData), + NA) + expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4) + } +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib_stat.R b/R/pkg/inst/tests/testthat/test_mllib_stat.R new file mode 100644 index 0000000000000..beb148e7702fd --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_stat.R @@ -0,0 +1,53 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib statistics algorithms") + +# Tests for MLlib statistics algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +test_that("spark.kstest", { + data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5)) + df <- createDataFrame(data) + testResult <- spark.kstest(df, "test", "norm") + stats <- summary(testResult) + + rStats <- ks.test(data$test, "pnorm", alternative = "two.sided") + + expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4) + expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) + expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") + + testResult <- spark.kstest(df, "test", "norm", -0.5) + stats <- summary(testResult) + + rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided") + + expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4) + expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) + expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") + + # Test print.summary.KSTest + printStats <- capture.output(print.summary.KSTest(stats)) + expect_match(printStats[1], "Kolmogorov-Smirnov test summary:") + expect_match(printStats[5], + "Low presumption against null hypothesis: Sample follows theoretical distribution. ") +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_mllib_tree.R b/R/pkg/inst/tests/testthat/test_mllib_tree.R new file mode 100644 index 0000000000000..5d13539be8a83 --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_tree.R @@ -0,0 +1,203 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("MLlib tree-based algorithms") + +# Tests for MLlib tree-based algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +absoluteSparkPath <- function(x) { + sparkHome <- sparkR.conf("spark.home") + file.path(sparkHome, x) +} + +test_that("spark.gbt", { + # regression + data <- suppressWarnings(createDataFrame(longley)) + model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123) + predictions <- collect(predict(model, data)) + expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, + 63.221, 63.639, 64.989, 63.761, + 66.019, 67.857, 68.169, 66.513, + 68.655, 69.564, 69.331, 70.551), + tolerance = 1e-4) + stats <- summary(model) + expect_equal(stats$numTrees, 20) + expect_equal(stats$formula, "Employed ~ .") + expect_equal(stats$numFeatures, 6) + expect_equal(length(stats$treeWeights), 20) + + modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$formula, stats2$formula) + expect_equal(stats$numFeatures, stats2$numFeatures) + expect_equal(stats$features, stats2$features) + expect_equal(stats$featureImportances, stats2$featureImportances) + expect_equal(stats$numTrees, stats2$numTrees) + expect_equal(stats$treeWeights, stats2$treeWeights) + + unlink(modelPath) + + # classification + # label must be binary - GBTClassifier currently only supports binary classification. + iris2 <- iris[iris$Species != "virginica", ] + data <- suppressWarnings(createDataFrame(iris2)) + model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification") + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + expect_error(capture.output(stats), NA) + expect_true(length(capture.output(stats)) > 6) + predictions <- collect(predict(model, data))$prediction + # test string prediction values + expect_equal(length(grep("setosa", predictions)), 50) + expect_equal(length(grep("versicolor", predictions)), 50) + + modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$depth, stats2$depth) + expect_equal(stats$numNodes, stats2$numNodes) + expect_equal(stats$numClasses, stats2$numClasses) + + unlink(modelPath) + + iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) + df <- suppressWarnings(createDataFrame(iris2)) + m <- spark.gbt(df, NumericSpecies ~ ., type = "classification") + s <- summary(m) + # test numeric prediction values + expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction)) + expect_equal(s$numFeatures, 5) + expect_equal(s$numTrees, 20) + + # spark.gbt classification can work on libsvm data + data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"), + source = "libsvm") + model <- spark.gbt(data, label ~ features, "classification") + expect_equal(summary(model)$numFeatures, 692) +}) + +test_that("spark.randomForest", { + # regression + data <- suppressWarnings(createDataFrame(longley)) + model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, + numTrees = 1) + + predictions <- collect(predict(model, data)) + 
expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, + 63.221, 63.639, 64.989, 63.761, + 66.019, 67.857, 68.169, 66.513, + 68.655, 69.564, 69.331, 70.551), + tolerance = 1e-4) + + stats <- summary(model) + expect_equal(stats$numTrees, 1) + expect_error(capture.output(stats), NA) + expect_true(length(capture.output(stats)) > 6) + + model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, + numTrees = 20, seed = 123) + predictions <- collect(predict(model, data)) + expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070, + 63.53160, 64.05470, 65.12710, 64.30450, + 66.70910, 67.86125, 68.08700, 67.21865, + 68.89275, 69.53180, 69.39640, 69.68250), + + tolerance = 1e-4) + stats <- summary(model) + expect_equal(stats$numTrees, 20) + + modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$formula, stats2$formula) + expect_equal(stats$numFeatures, stats2$numFeatures) + expect_equal(stats$features, stats2$features) + expect_equal(stats$featureImportances, stats2$featureImportances) + expect_equal(stats$numTrees, stats2$numTrees) + expect_equal(stats$treeWeights, stats2$treeWeights) + + unlink(modelPath) + + # classification + data <- suppressWarnings(createDataFrame(iris)) + model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification", + maxDepth = 5, maxBins = 16) + + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + expect_error(capture.output(stats), NA) + expect_true(length(capture.output(stats)) > 6) + # Test string prediction values + predictions <- collect(predict(model, data))$prediction + expect_equal(length(grep("setosa", predictions)), 50) + expect_equal(length(grep("versicolor", predictions)), 50) + + modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$depth, stats2$depth) + expect_equal(stats$numNodes, stats2$numNodes) + expect_equal(stats$numClasses, stats2$numClasses) + + unlink(modelPath) + + # Test numeric response variable + labelToIndex <- function(species) { + switch(as.character(species), + setosa = 0.0, + versicolor = 1.0, + virginica = 2.0 + ) + } + iris$NumericSpecies <- lapply(iris$Species, labelToIndex) + data <- suppressWarnings(createDataFrame(iris[-5])) + model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification", + maxDepth = 5, maxBins = 16) + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + # Test numeric prediction values + predictions <- collect(predict(model, data))$prediction + expect_equal(length(grep("1.0", predictions)), 50) + expect_equal(length(grep("2.0", predictions)), 50) + + # spark.randomForest classification can work on libsvm data + data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), + source = "libsvm") + model <- spark.randomForest(data, label ~ features, "classification") + expect_equal(summary(model)$numFeatures, 4) +}) + +sparkR.session.stop() diff --git 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 5b42843717e90..f219c5605b643 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -86,7 +86,7 @@ public int compare(RecordPointerAndKeyPrefix r1, RecordPointerAndKeyPrefix r2) { private final PrefixComparators.RadixSortSupport radixSortSupport; /** - * Within this buffer, position {@code 2 * i} holds a pointer pointer to the record at + * Within this buffer, position {@code 2 * i} holds a pointer to the record at * index {@code i}, while position {@code 2 * i + 1} in the array holds an 8-byte key prefix. * * Only part of the array will be used to store the pointers, the rest part is preserved as diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java index 430bf677edbdf..d9f84d10e9051 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java @@ -25,7 +25,7 @@ * Supports sorting an array of (record pointer, key prefix) pairs. * Used in {@link UnsafeInMemorySorter}. *
- * Within each long[] buffer, position {@code 2 * i} holds a pointer pointer to the record at + * Within each long[] buffer, position {@code 2 * i} holds a pointer to the record at * index {@code i}, while position {@code 2 * i + 1} in the array holds an 8-byte key prefix. */ public final class UnsafeSortDataFormat diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 6f5c31d7ab71c..4ca442b629fd9 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -317,7 +317,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf, pool } - // Make sure that that we aren't going to exceed the max RPC message size by making sure + // Make sure that we aren't going to exceed the max RPC message size by making sure // we use broadcast to send large map output statuses. if (minSizeForBroadcast > maxRpcMessageSize) { val msg = s"spark.shuffle.mapOutput.minSizeForBroadcast ($minSizeForBroadcast bytes) must " + diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 7745387dbceba..8c1b5f7bf0d9b 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -98,7 +98,7 @@ case class FetchFailed( * 4 task failures, instead we immediately go back to the stage which generated the map output, * and regenerate the missing data. (2) we don't count fetch failures for blacklisting, since * presumably its not the fault of the executor where the task ran, but the executor which - * stored the data. This is especially important because we we might rack up a bunch of + * stored the data. This is especially important because we might rack up a bunch of * fetch-failures in rapid succession, on all nodes of the cluster, due to one bad node. 
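A minimal, self-contained Scala sketch of the (record pointer, key prefix) layout that the UnsafeInMemorySorter and UnsafeSortDataFormat comments above describe. The object, field, and method names here are illustrative only and are not part of the patch; the real sorter uses an off-heap-capable LongArray rather than a plain Array[Long].

    object SortBufferLayoutSketch {
      // Illustrative capacity; the real sorter grows its backing array on demand.
      val capacity: Int = 1024
      // Each sort entry occupies two consecutive longs: an encoded record pointer
      // followed by an 8-byte key prefix used for cheap comparisons before the
      // full record is dereferenced.
      val buffer: Array[Long] = new Array[Long](2 * capacity)

      def recordPointer(i: Int): Long = buffer(2 * i)
      def keyPrefix(i: Int): Long = buffer(2 * i + 1)
    }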
*/ override def countTowardsTaskFailures: Boolean = false diff --git a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala index 79f4d06c8460e..320af5cf97550 100644 --- a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala @@ -43,7 +43,7 @@ import org.apache.spark.util.{ThreadUtils, Utils} * Execute using * ./bin/spark-class org.apache.spark.deploy.FaultToleranceTest * - * Make sure that that the environment includes the following properties in SPARK_DAEMON_JAVA_OPTS + * Make sure that the environment includes the following properties in SPARK_DAEMON_JAVA_OPTS * *and* SPARK_JAVA_OPTS: * - spark.deploy.recoveryMode=ZOOKEEPER * - spark.deploy.zookeeper.url=172.17.42.1:2181 diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 3011ed0f95d8e..cd241d6d22745 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -97,6 +97,13 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .map { d => Utils.resolveURI(d).toString } .getOrElse(DEFAULT_LOG_DIR) + private val HISTORY_UI_ACLS_ENABLE = conf.getBoolean("spark.history.ui.acls.enable", false) + private val HISTORY_UI_ADMIN_ACLS = conf.get("spark.history.ui.admin.acls", "") + private val HISTORY_UI_ADMIN_ACLS_GROUPS = conf.get("spark.history.ui.admin.acls.groups", "") + logInfo(s"History server ui acls " + (if (HISTORY_UI_ACLS_ENABLE) "enabled" else "disabled") + + "; users with admin permissions: " + HISTORY_UI_ADMIN_ACLS.toString + + "; groups with admin permissions" + HISTORY_UI_ADMIN_ACLS_GROUPS.toString) + private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) private val fs = Utils.getHadoopFileSystem(logDir, hadoopConf) @@ -250,13 +257,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val appListener = replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) if (appListener.appId.isDefined) { - val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false) - ui.getSecurityManager.setAcls(uiAclsEnabled) + ui.getSecurityManager.setAcls(HISTORY_UI_ACLS_ENABLE) // make sure to set admin acls before view acls so they are properly picked up - ui.getSecurityManager.setAdminAcls(appListener.adminAcls.getOrElse("")) - ui.getSecurityManager.setViewAcls(attempt.sparkUser, - appListener.viewAcls.getOrElse("")) - ui.getSecurityManager.setAdminAclsGroups(appListener.adminAclsGroups.getOrElse("")) + val adminAcls = HISTORY_UI_ADMIN_ACLS + "," + appListener.adminAcls.getOrElse("") + ui.getSecurityManager.setAdminAcls(adminAcls) + ui.getSecurityManager.setViewAcls(attempt.sparkUser, appListener.viewAcls.getOrElse("")) + val adminAclsGroups = HISTORY_UI_ADMIN_ACLS_GROUPS + "," + + appListener.adminAclsGroups.getOrElse("") + ui.getSecurityManager.setAdminAclsGroups(adminAclsGroups) ui.getSecurityManager.setViewAclsGroups(appListener.viewAclsGroups.getOrElse("")) Some(LoadedAppUI(ui, updateProbe(appId, attemptId, attempt.fileSize))) } else { diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 2b00a4a6b3e52..54f39f7620e5d 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -291,7 +291,7 @@ object HistoryServer extends Logging { /** * Create a security manager. - * This turns off security in the SecurityManager, so that the the History Server can start + * This turns off security in the SecurityManager, so that the History Server can start * in a Spark cluster where security is enabled. * @param config configuration for the SecurityManager constructor * @return the security manager for use in constructing the History Server. diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index f963a4606068b..e48817ebbafdd 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -445,12 +445,25 @@ private[deploy] class Worker( // Create local dirs for the executor. These are passed to the executor via the // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the // application finishes. - val appLocalDirs = appDirectories.getOrElse(appId, - Utils.getOrCreateLocalRootDirs(conf).map { dir => - val appDir = Utils.createDirectory(dir, namePrefix = "executor") - Utils.chmod700(appDir) - appDir.getAbsolutePath() - }.toSeq) + val appLocalDirs = appDirectories.getOrElse(appId, { + val localRootDirs = Utils.getOrCreateLocalRootDirs(conf) + val dirs = localRootDirs.flatMap { dir => + try { + val appDir = Utils.createDirectory(dir, namePrefix = "executor") + Utils.chmod700(appDir) + Some(appDir.getAbsolutePath()) + } catch { + case e: IOException => + logWarning(s"${e.getMessage}. Ignoring this directory.") + None + } + }.toSeq + if (dirs.isEmpty) { + throw new IOException("No subfolder can be created in " + + s"${localRootDirs.mkString(",")}.") + } + dirs + }) appDirectories(appId) = appLocalDirs val manager = new ExecutorRunner( appId, diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 92a27902c6696..4a38560d8deca 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -92,10 +92,9 @@ private[spark] class CoarseGrainedExecutorBackend( if (executor == null) { exitExecutor(1, "Received LaunchTask command but executor was null") } else { - val taskDesc = ser.deserialize[TaskDescription](data.value) + val taskDesc = TaskDescription.decode(data.value) logInfo("Got assigned task " + taskDesc.taskId) - executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber, - taskDesc.name, taskDesc.serializedTask) + executor.launchTask(this, taskDesc) } case KillTask(taskId, _, interruptThread) => diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 3346f6dd1f975..789198f52ca45 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -26,7 +26,7 @@ import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ -import scala.collection.mutable.{ArrayBuffer, HashMap} +import 
scala.collection.mutable.{ArrayBuffer, HashMap, Map} import scala.util.control.NonFatal import org.apache.spark._ @@ -34,7 +34,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.rpc.RpcTimeout -import org.apache.spark.scheduler.{AccumulableInfo, DirectTaskResult, IndirectTaskResult, Task} +import org.apache.spark.scheduler.{DirectTaskResult, IndirectTaskResult, Task, TaskDescription} import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{StorageLevel, TaskResultBlockId} import org.apache.spark.util._ @@ -148,15 +148,9 @@ private[spark] class Executor( startDriverHeartbeater() - def launchTask( - context: ExecutorBackend, - taskId: Long, - attemptNumber: Int, - taskName: String, - serializedTask: ByteBuffer): Unit = { - val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName, - serializedTask) - runningTasks.put(taskId, tr) + def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = { + val tr = new TaskRunner(context, taskDescription) + runningTasks.put(taskDescription.taskId, tr) threadPool.execute(tr) } @@ -212,13 +206,12 @@ private[spark] class Executor( class TaskRunner( execBackend: ExecutorBackend, - val taskId: Long, - val attemptNumber: Int, - taskName: String, - serializedTask: ByteBuffer) + private val taskDescription: TaskDescription) extends Runnable { + val taskId = taskDescription.taskId val threadName = s"Executor task launch worker for task $taskId" + private val taskName = taskDescription.name /** Whether this task has been killed. */ @volatile private var killed = false @@ -287,16 +280,14 @@ private[spark] class Executor( startGCTime = computeTotalGcTime() try { - val (taskFiles, taskJars, taskProps, taskBytes) = - Task.deserializeWithDependencies(serializedTask) - // Must be set before updateDependencies() is called, in case fetching dependencies // requires access to properties contained within (e.g. for access control). - Executor.taskDeserializationProps.set(taskProps) + Executor.taskDeserializationProps.set(taskDescription.properties) - updateDependencies(taskFiles, taskJars) - task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader) - task.localProperties = taskProps + updateDependencies(taskDescription.addedFiles, taskDescription.addedJars) + task = ser.deserialize[Task[Any]]( + taskDescription.serializedTask, Thread.currentThread.getContextClassLoader) + task.localProperties = taskDescription.properties task.setTaskMemoryManager(taskMemoryManager) // If this task has been killed before we deserialized it, let's quit now. Otherwise, @@ -321,7 +312,7 @@ private[spark] class Executor( val value = try { val res = task.run( taskAttemptId = taskId, - attemptNumber = attemptNumber, + attemptNumber = taskDescription.attemptNumber, metricsSystem = env.metricsSystem) threwException = false res @@ -637,7 +628,7 @@ private[spark] class Executor( * Download any missing dependencies if we receive a new set of files and JARs from the * SparkContext. Also adds any new JARs we fetched to the class loader. 
*/ - private def updateDependencies(newFiles: HashMap[String, Long], newJars: HashMap[String, Long]) { + private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]) { lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) synchronized { // Fetch missing dependencies diff --git a/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala b/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala index f7a991770d402..8dd1a1ea059be 100644 --- a/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala @@ -92,7 +92,7 @@ class ShuffleReadMetrics private[spark] () extends Serializable { private[spark] def setRecordsRead(v: Long): Unit = _recordsRead.setValue(v) /** - * Resets the value of the current metrics (`this`) and and merges all the independent + * Resets the value of the current metrics (`this`) and merges all the independent * [[TempShuffleReadMetrics]] into `this`. */ private[spark] def setMergeValues(metrics: Seq[TempShuffleReadMetrics]): Unit = { diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index dc70eb82d2b54..3d4ea3cccc934 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -37,7 +37,7 @@ import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.Utils /** - * A BlockTransferService that uses Netty to fetch a set of blocks at at time. + * A BlockTransferService that uses Netty to fetch a set of blocks at time. */ private[spark] class NettyBlockTransferService( conf: SparkConf, diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 5becca6c06a6c..51976f666dfce 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -215,89 +215,3 @@ private[spark] abstract class Task[T]( } } } - -/** - * Handles transmission of tasks and their dependencies, because this can be slightly tricky. We - * need to send the list of JARs and files added to the SparkContext with each task to ensure that - * worker nodes find out about it, but we can't make it part of the Task because the user's code in - * the task might depend on one of the JARs. Thus we serialize each task as multiple objects, by - * first writing out its dependencies. - */ -private[spark] object Task { - /** - * Serialize a task and the current app dependencies (files and JARs added to the SparkContext) - */ - def serializeWithDependencies( - task: Task[_], - currentFiles: mutable.Map[String, Long], - currentJars: mutable.Map[String, Long], - serializer: SerializerInstance) - : ByteBuffer = { - - val out = new ByteBufferOutputStream(4096) - val dataOut = new DataOutputStream(out) - - // Write currentFiles - dataOut.writeInt(currentFiles.size) - for ((name, timestamp) <- currentFiles) { - dataOut.writeUTF(name) - dataOut.writeLong(timestamp) - } - - // Write currentJars - dataOut.writeInt(currentJars.size) - for ((name, timestamp) <- currentJars) { - dataOut.writeUTF(name) - dataOut.writeLong(timestamp) - } - - // Write the task properties separately so it is available before full task deserialization. 
- val propBytes = Utils.serialize(task.localProperties) - dataOut.writeInt(propBytes.length) - dataOut.write(propBytes) - - // Write the task itself and finish - dataOut.flush() - val taskBytes = serializer.serialize(task) - Utils.writeByteBuffer(taskBytes, out) - out.close() - out.toByteBuffer - } - - /** - * Deserialize the list of dependencies in a task serialized with serializeWithDependencies, - * and return the task itself as a serialized ByteBuffer. The caller can then update its - * ClassLoaders and deserialize the task. - * - * @return (taskFiles, taskJars, taskProps, taskBytes) - */ - def deserializeWithDependencies(serializedTask: ByteBuffer) - : (HashMap[String, Long], HashMap[String, Long], Properties, ByteBuffer) = { - - val in = new ByteBufferInputStream(serializedTask) - val dataIn = new DataInputStream(in) - - // Read task's files - val taskFiles = new HashMap[String, Long]() - val numFiles = dataIn.readInt() - for (i <- 0 until numFiles) { - taskFiles(dataIn.readUTF()) = dataIn.readLong() - } - - // Read task's JARs - val taskJars = new HashMap[String, Long]() - val numJars = dataIn.readInt() - for (i <- 0 until numJars) { - taskJars(dataIn.readUTF()) = dataIn.readLong() - } - - val propLength = dataIn.readInt() - val propBytes = new Array[Byte](propLength) - dataIn.readFully(propBytes, 0, propLength) - val taskProps = Utils.deserialize[Properties](propBytes) - - // Create a sub-buffer for the rest of the data, which is the serialized Task object - val subBuffer = serializedTask.slice() // ByteBufferInputStream will have read just up to task - (taskFiles, taskJars, taskProps, subBuffer) - } -} diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 45c742cbff5e7..78aa5c40010cc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -17,13 +17,31 @@ package org.apache.spark.scheduler +import java.io.{DataInputStream, DataOutputStream} import java.nio.ByteBuffer +import java.util.Properties -import org.apache.spark.util.SerializableBuffer +import scala.collection.JavaConverters._ +import scala.collection.mutable.{HashMap, Map} + +import org.apache.spark.util.{ByteBufferInputStream, ByteBufferOutputStream, Utils} /** * Description of a task that gets passed onto executors to be executed, usually created by * `TaskSetManager.resourceOffer`. + * + * TaskDescriptions and the associated Task need to be serialized carefully for two reasons: + * + * (1) When a TaskDescription is received by an Executor, the Executor needs to first get the + * list of JARs and files and add these to the classpath, and set the properties, before + * deserializing the Task object (serializedTask). This is why the Properties are included + * in the TaskDescription, even though they're also in the serialized task. + * (2) Because a TaskDescription is serialized and sent to an executor for each task, efficient + * serialization (both in terms of serialization time and serialized buffer size) is + * important. For this reason, we serialize TaskDescriptions ourselves with the + * TaskDescription.encode and TaskDescription.decode methods. This results in a smaller + * serialized size because it avoids serializing unnecessary fields in the Map objects + * (which can introduce significant overhead when the maps are small). 
*/ private[spark] class TaskDescription( val taskId: Long, @@ -31,13 +49,88 @@ private[spark] class TaskDescription( val executorId: String, val name: String, val index: Int, // Index within this task's TaskSet - _serializedTask: ByteBuffer) - extends Serializable { + val addedFiles: Map[String, Long], + val addedJars: Map[String, Long], + val properties: Properties, + val serializedTask: ByteBuffer) { + + override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index) +} - // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer - private val buffer = new SerializableBuffer(_serializedTask) +private[spark] object TaskDescription { + private def serializeStringLongMap(map: Map[String, Long], dataOut: DataOutputStream): Unit = { + dataOut.writeInt(map.size) + for ((key, value) <- map) { + dataOut.writeUTF(key) + dataOut.writeLong(value) + } + } - def serializedTask: ByteBuffer = buffer.value + def encode(taskDescription: TaskDescription): ByteBuffer = { + val bytesOut = new ByteBufferOutputStream(4096) + val dataOut = new DataOutputStream(bytesOut) - override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index) + dataOut.writeLong(taskDescription.taskId) + dataOut.writeInt(taskDescription.attemptNumber) + dataOut.writeUTF(taskDescription.executorId) + dataOut.writeUTF(taskDescription.name) + dataOut.writeInt(taskDescription.index) + + // Write files. + serializeStringLongMap(taskDescription.addedFiles, dataOut) + + // Write jars. + serializeStringLongMap(taskDescription.addedJars, dataOut) + + // Write properties. + dataOut.writeInt(taskDescription.properties.size()) + taskDescription.properties.asScala.foreach { case (key, value) => + dataOut.writeUTF(key) + dataOut.writeUTF(value) + } + + // Write the task. The task is already serialized, so write it directly to the byte buffer. + Utils.writeByteBuffer(taskDescription.serializedTask, bytesOut) + + dataOut.close() + bytesOut.close() + bytesOut.toByteBuffer + } + + private def deserializeStringLongMap(dataIn: DataInputStream): HashMap[String, Long] = { + val map = new HashMap[String, Long]() + val mapSize = dataIn.readInt() + for (i <- 0 until mapSize) { + map(dataIn.readUTF()) = dataIn.readLong() + } + map + } + + def decode(byteBuffer: ByteBuffer): TaskDescription = { + val dataIn = new DataInputStream(new ByteBufferInputStream(byteBuffer)) + val taskId = dataIn.readLong() + val attemptNumber = dataIn.readInt() + val executorId = dataIn.readUTF() + val name = dataIn.readUTF() + val index = dataIn.readInt() + + // Read files. + val taskFiles = deserializeStringLongMap(dataIn) + + // Read jars. + val taskJars = deserializeStringLongMap(dataIn) + + // Read properties. + val properties = new Properties() + val numProperties = dataIn.readInt() + for (i <- 0 until numProperties) { + properties.setProperty(dataIn.readUTF(), dataIn.readUTF()) + } + + // Create a sub-buffer for the serialized task into its own buffer (to be deserialized later). 
+ val serializedTask = byteBuffer.slice() + + new TaskDescription(taskId, attemptNumber, executorId, name, index, taskFiles, taskJars, + properties, serializedTask) + } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index b1addc128e696..a284f7956cd31 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -143,8 +143,12 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul logError( "Could not deserialize TaskEndReason: ClassNotFound with classloader " + loader) case ex: Exception => // No-op + } finally { + // If there's an error while deserializing the TaskEndReason, this Runnable + // will die. Still tell the scheduler about the task failure, to avoid a hang + // where the scheduler thinks the task is still running. + scheduler.handleFailedTask(taskSetManager, tid, taskState, reason) } - scheduler.handleFailedTask(taskSetManager, tid, taskState, reason) } }) } catch { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 3756c216f5ecb..c7ff13cebfaaa 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -446,9 +446,8 @@ private[spark] class TaskSetManager( lastLaunchTime = curTime } // Serialize and return the task - val startTime = clock.getTimeMillis() val serializedTask: ByteBuffer = try { - Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser) + ser.serialize(task) } catch { // If the task cannot be serialized, then there's no point to re-attempt the task, // as it will always fail. So just abort the whole task-set. @@ -475,8 +474,16 @@ private[spark] class TaskSetManager( s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit} bytes)") sched.dagScheduler.taskStarted(task, info) - new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId, - taskName, index, serializedTask) + new TaskDescription( + taskId, + attemptNum, + execId, + taskName, + index, + sched.sc.addedFiles, + sched.sc.addedJars, + task.localProperties, + serializedTask) } } else { None diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 3452487e72e88..31575c0ca0d15 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -98,11 +98,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // Executors that have been lost, but for which we don't yet know the real exit reason. 
protected val executorsPendingLossReason = new HashSet[String] - // If this DriverEndpoint is changed to support multiple threads, - // then this may need to be changed so that we don't share the serializer - // instance across threads - private val ser = SparkEnv.get.closureSerializer.newInstance() - protected val addressToExecutorId = new HashMap[RpcAddress, String] private val reviveThread = @@ -249,7 +244,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // Launch tasks returned by a set of resource offers private def launchTasks(tasks: Seq[Seq[TaskDescription]]) { for (task <- tasks.flatten) { - val serializedTask = ser.serialize(task) + val serializedTask = TaskDescription.encode(task) if (serializedTask.limit >= maxRpcMessageSize) { scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr => try { diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala index 7a73e8ed8a38f..625f998cd4608 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala @@ -84,8 +84,7 @@ private[spark] class LocalEndpoint( val offers = IndexedSeq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) for (task <- scheduler.resourceOffers(offers).flatten) { freeCores -= scheduler.CPUS_PER_TASK - executor.launchTask(executorBackend, taskId = task.taskId, attemptNumber = task.attemptNumber, - task.name, task.serializedTask) + executor.launchTask(executorBackend, task) } } } diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala index 386fdfd218a88..3bfdf95db84c6 100644 --- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala +++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala @@ -350,7 +350,7 @@ object SizeEstimator extends Logging { // 3. consistent fields layouts throughout the hierarchy: This means we should layout // superclass first. And we can use superclass's shellSize as a starting point to layout the // other fields in this class. - // 4. class alignment: HotSpot rounds field blocks up to to HeapOopSize not 4 bytes, confirmed + // 4. class alignment: HotSpot rounds field blocks up to HeapOopSize not 4 bytes, confirmed // with Aleksey. see https://bugs.openjdk.java.net/browse/CODETOOLS-7901322 // // The real world field layout is much more complicated. 
There are three kinds of fields diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 078cc3d5b4f0c..0dcf0307e113f 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -237,9 +237,11 @@ private[spark] object Utils extends Logging { if (bb.hasArray) { out.write(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()) } else { + val originalPosition = bb.position() val bbval = new Array[Byte](bb.remaining()) bb.get(bbval) out.write(bbval) + bb.position(originalPosition) } } @@ -250,9 +252,11 @@ private[spark] object Utils extends Logging { if (bb.hasArray) { out.write(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()) } else { + val originalPosition = bb.position() val bbval = new Array[Byte](bb.remaining()) bb.get(bbval) out.write(bbval) + bb.position(originalPosition) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala index e3304be792af7..7998e3702c122 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala @@ -253,7 +253,7 @@ class ApplicationCacheSuite extends SparkFunSuite with Logging with MockitoSugar assertNotFound(appId, None) } - test("Test that if an attempt ID is is set, it must be used in lookups") { + test("Test that if an attempt ID is set, it must be used in lookups") { val operations = new StubCacheOperations() val clock = new ManualClock(1) implicit val cache = new ApplicationCache(operations, retainedApplications = 10, clock = clock) diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 027f412c7581b..8cb359ed45192 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -35,10 +35,11 @@ import org.scalatest.BeforeAndAfter import org.scalatest.Matchers import org.scalatest.concurrent.Eventually._ -import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.io._ import org.apache.spark.scheduler._ +import org.apache.spark.security.GroupMappingServiceProvider import org.apache.spark.util.{Clock, JsonProtocol, ManualClock, Utils} class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { @@ -474,6 +475,102 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc } } + test("support history server ui admin acls") { + def createAndCheck(conf: SparkConf, properties: (String, String)*) + (checkFn: SecurityManager => Unit): Unit = { + // Empty the testDir for each test. 
+ if (testDir.exists() && testDir.isDirectory) { + testDir.listFiles().foreach { f => if (f.isFile) f.delete() } + } + + var provider: FsHistoryProvider = null + try { + provider = new FsHistoryProvider(conf) + val log = newLogFile("app1", Some("attempt1"), inProgress = false) + writeFile(log, true, None, + SparkListenerApplicationStart("app1", Some("app1"), System.currentTimeMillis(), + "test", Some("attempt1")), + SparkListenerEnvironmentUpdate(Map( + "Spark Properties" -> properties.toSeq, + "JVM Information" -> Seq.empty, + "System Properties" -> Seq.empty, + "Classpath Entries" -> Seq.empty + )), + SparkListenerApplicationEnd(System.currentTimeMillis())) + + provider.checkForLogs() + val appUi = provider.getAppUI("app1", Some("attempt1")) + + assert(appUi.nonEmpty) + val securityManager = appUi.get.ui.securityManager + checkFn(securityManager) + } finally { + if (provider != null) { + provider.stop() + } + } + } + + // Test both history ui admin acls and application acls are configured. + val conf1 = createTestConf() + .set("spark.history.ui.acls.enable", "true") + .set("spark.history.ui.admin.acls", "user1,user2") + .set("spark.history.ui.admin.acls.groups", "group1") + .set("spark.user.groups.mapping", classOf[TestGroupsMappingProvider].getName) + + createAndCheck(conf1, ("spark.admin.acls", "user"), ("spark.admin.acls.groups", "group")) { + securityManager => + // Test whether user has permission to access UI. + securityManager.checkUIViewPermissions("user1") should be (true) + securityManager.checkUIViewPermissions("user2") should be (true) + securityManager.checkUIViewPermissions("user") should be (true) + securityManager.checkUIViewPermissions("abc") should be (false) + + // Test whether user with admin group has permission to access UI. + securityManager.checkUIViewPermissions("user3") should be (true) + securityManager.checkUIViewPermissions("user4") should be (true) + securityManager.checkUIViewPermissions("user5") should be (true) + securityManager.checkUIViewPermissions("user6") should be (false) + } + + // Test only history ui admin acls are configured. + val conf2 = createTestConf() + .set("spark.history.ui.acls.enable", "true") + .set("spark.history.ui.admin.acls", "user1,user2") + .set("spark.history.ui.admin.acls.groups", "group1") + .set("spark.user.groups.mapping", classOf[TestGroupsMappingProvider].getName) + createAndCheck(conf2) { securityManager => + // Test whether user has permission to access UI. + securityManager.checkUIViewPermissions("user1") should be (true) + securityManager.checkUIViewPermissions("user2") should be (true) + // Check the unknown "user" should return false + securityManager.checkUIViewPermissions("user") should be (false) + + // Test whether user with admin group has permission to access UI. + securityManager.checkUIViewPermissions("user3") should be (true) + securityManager.checkUIViewPermissions("user4") should be (true) + // Check the "user5" without mapping relation should return false + securityManager.checkUIViewPermissions("user5") should be (false) + } + + // Test neither history ui admin acls nor application acls are configured. + val conf3 = createTestConf() + .set("spark.history.ui.acls.enable", "true") + .set("spark.user.groups.mapping", classOf[TestGroupsMappingProvider].getName) + createAndCheck(conf3) { securityManager => + // Test whether user has permission to access UI. 
+ securityManager.checkUIViewPermissions("user1") should be (false) + securityManager.checkUIViewPermissions("user2") should be (false) + securityManager.checkUIViewPermissions("user") should be (false) + + // Test whether user with admin group has permission to access UI. + // Check should be failed since we don't have acl group settings. + securityManager.checkUIViewPermissions("user3") should be (false) + securityManager.checkUIViewPermissions("user4") should be (false) + securityManager.checkUIViewPermissions("user5") should be (false) + } + } + /** * Asks the provider to check for logs and calls a function to perform checks on the updated * app list. Example: @@ -532,3 +629,15 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc } } + +class TestGroupsMappingProvider extends GroupMappingServiceProvider { + private val mappings = Map( + "user3" -> "group1", + "user4" -> "group1", + "user5" -> "group") + + override def getGroups(username: String): Set[String] = { + mappings.get(username).map(Set(_)).getOrElse(Set.empty) + } +} + diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 742500d87d32a..f94baaa30d18d 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -18,9 +18,10 @@ package org.apache.spark.executor import java.nio.ByteBuffer +import java.util.Properties import java.util.concurrent.CountDownLatch -import scala.collection.mutable.HashMap +import scala.collection.mutable.Map import org.mockito.Matchers._ import org.mockito.Mockito.{mock, when} @@ -32,7 +33,7 @@ import org.apache.spark.TaskState.TaskState import org.apache.spark.memory.MemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.rpc.RpcEnv -import org.apache.spark.scheduler.{FakeTask, Task} +import org.apache.spark.scheduler.{FakeTask, TaskDescription} import org.apache.spark.serializer.JavaSerializer class ExecutorSuite extends SparkFunSuite { @@ -52,13 +53,18 @@ class ExecutorSuite extends SparkFunSuite { when(mockEnv.memoryManager).thenReturn(mockMemoryManager) when(mockEnv.closureSerializer).thenReturn(serializer) val fakeTaskMetrics = serializer.newInstance().serialize(TaskMetrics.registered).array() - - val serializedTask = - Task.serializeWithDependencies( - new FakeTask(0, 0, Nil, fakeTaskMetrics), - HashMap[String, Long](), - HashMap[String, Long](), - serializer.newInstance()) + val serializedTask = serializer.newInstance().serialize( + new FakeTask(0, 0, Nil, fakeTaskMetrics)) + val taskDescription = new TaskDescription( + taskId = 0, + attemptNumber = 0, + executorId = "", + name = "", + index = 0, + addedFiles = Map[String, Long](), + addedJars = Map[String, Long](), + properties = new Properties, + serializedTask) // we use latches to force the program to run in this order: // +-----------------------------+---------------------------------------+ @@ -108,7 +114,7 @@ class ExecutorSuite extends SparkFunSuite { try { executor = new Executor("id", "localhost", mockEnv, userClassPath = Nil, isLocal = true) // the task will be launched in a dedicated worker thread - executor.launchTask(mockExecutorBackend, 0, 0, "", serializedTask) + executor.launchTask(mockExecutorBackend, taskDescription) executorSuiteHelper.latch1.await() // we know the task will be started, but not yet deserialized, because of the latches we diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 5e8a854e46a0f..f3d3f701af462 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1819,7 +1819,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostA"))) - // Reducer should run where RDD 2 has preferences, even though though it also has a shuffle dep + // Reducer should run where RDD 2 has preferences, even though it also has a shuffle dep val reduceTaskSet = taskSets(1) assertLocations(reduceTaskSet, Seq(Seq("hostB"))) complete(reduceTaskSet, Seq((Success, 42))) @@ -2058,7 +2058,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou // Now complete tasks in the second task set val newTaskSet = taskSets(1) - assert(newTaskSet.tasks.size === 2) // Both tasks 0 and 1 were on on hostA + assert(newTaskSet.tasks.size === 2) // Both tasks 0 and 1 were on hostA runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2))) assert(results.size === 0) // Map stage job should not be complete yet runEvent(makeCompletionEvent(newTaskSet.tasks(1), Success, makeMapStatus("hostB", 2))) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala new file mode 100644 index 0000000000000..9f1fe0515732e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import java.nio.ByteBuffer +import java.util.Properties + +import scala.collection.mutable.HashMap + +import org.apache.spark.SparkFunSuite + +class TaskDescriptionSuite extends SparkFunSuite { + test("encoding and then decoding a TaskDescription results in the same TaskDescription") { + val originalFiles = new HashMap[String, Long]() + originalFiles.put("fileUrl1", 1824) + originalFiles.put("fileUrl2", 2) + + val originalJars = new HashMap[String, Long]() + originalJars.put("jar1", 3) + + val originalProperties = new Properties() + originalProperties.put("property1", "18") + originalProperties.put("property2", "test value") + + // Create a dummy byte buffer for the task. 
+ val taskBuffer = ByteBuffer.wrap(Array[Byte](1, 2, 3, 4)) + + val originalTaskDescription = new TaskDescription( + taskId = 1520589, + attemptNumber = 2, + executorId = "testExecutor", + name = "task for test", + index = 19, + originalFiles, + originalJars, + originalProperties, + taskBuffer + ) + + val serializedTaskDescription = TaskDescription.encode(originalTaskDescription) + val decodedTaskDescription = TaskDescription.decode(serializedTaskDescription) + + // Make sure that all of the fields in the decoded task description match the original. + assert(decodedTaskDescription.taskId === originalTaskDescription.taskId) + assert(decodedTaskDescription.attemptNumber === originalTaskDescription.attemptNumber) + assert(decodedTaskDescription.executorId === originalTaskDescription.executorId) + assert(decodedTaskDescription.name === originalTaskDescription.name) + assert(decodedTaskDescription.index === originalTaskDescription.index) + assert(decodedTaskDescription.addedFiles.equals(originalFiles)) + assert(decodedTaskDescription.addedJars.equals(originalJars)) + assert(decodedTaskDescription.properties.equals(originalTaskDescription.properties)) + assert(decodedTaskDescription.serializedTask.equals(taskBuffer)) + } +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala index c9e682f53c105..3e55d399e9df9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler -import java.io.File +import java.io.{File, ObjectInputStream} import java.net.URL import java.nio.ByteBuffer @@ -248,5 +248,24 @@ class TaskResultGetterSuite extends SparkFunSuite with BeforeAndAfter with Local assert(resSizeAfter.exists(_.toString.toLong > 0L)) } + test("failed task is handled when error occurs deserializing the reason") { + sc = new SparkContext("local", "test", conf) + val rdd = sc.parallelize(Seq(1), 1).map { _ => + throw new UndeserializableException + } + val message = intercept[SparkException] { + rdd.collect() + }.getMessage + // Job failed, even though the failure reason is unknown. 
+ val unknownFailure = """(?s).*Lost task.*: UnknownReason.*""".r + assert(unknownFailure.findFirstMatchIn(message).isDefined) + } + +} + +private class UndeserializableException extends Exception { + private def readObject(in: ObjectInputStream): Unit = { + throw new NoClassDefFoundError() + } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index fb7b91222b499..442a603cae791 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.util -import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File, FileOutputStream, PrintStream} +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataOutput, DataOutputStream, File, + FileOutputStream, PrintStream} import java.lang.{Double => JDouble, Float => JFloat} import java.net.{BindException, ServerSocket, URI} import java.nio.{ByteBuffer, ByteOrder} @@ -389,6 +390,28 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(Utils.deserializeLongValue(bbuf.array) === testval) } + test("writeByteBuffer should not change ByteBuffer position") { + // Test a buffer with an underlying array, for both writeByteBuffer methods. + val testBuffer = ByteBuffer.wrap(Array[Byte](1, 2, 3, 4)) + assert(testBuffer.hasArray) + val bytesOut = new ByteBufferOutputStream(4096) + Utils.writeByteBuffer(testBuffer, bytesOut) + assert(testBuffer.position() === 0) + + val dataOut = new DataOutputStream(bytesOut) + Utils.writeByteBuffer(testBuffer, dataOut: DataOutput) + assert(testBuffer.position() === 0) + + // Test a buffer without an underlying array, for both writeByteBuffer methods. + val testDirectBuffer = ByteBuffer.allocateDirect(8) + assert(!testDirectBuffer.hasArray()) + Utils.writeByteBuffer(testDirectBuffer, bytesOut) + assert(testDirectBuffer.position() === 0) + + Utils.writeByteBuffer(testDirectBuffer, dataOut: DataOutput) + assert(testDirectBuffer.position() === 0) + } + test("get iterator size") { val empty = Seq[Int]() assert(Utils.getIteratorSize(empty.toIterator) === 0L) diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 7f0838268a111..c8b6a3346a460 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -53,7 +53,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { conf } - test("single insert insert") { + test("single insert") { val conf = createSparkConf(loadDefaults = false) sc = new SparkContext("local", "test", conf) val map = createExternalMap[Int] diff --git a/docs/configuration.md b/docs/configuration.md index bd67144007e92..d2fdef02df9e0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -59,6 +59,7 @@ The following format is accepted: 1p or 1pb (pebibytes = 1024 tebibytes) ## Dynamically Loading Spark Properties + In some cases, you may want to avoid hard-coding certain configurations in a `SparkConf`. For instance, if you'd like to run the same application with different masters or different amounts of memory. Spark allows you to simply create an empty conf: @@ -106,7 +107,8 @@ line will appear. 
For all other configuration properties, you can assume the def Most of the properties that control internal settings have reasonable default values. Some of the most common options to set are: -#### Application Properties +### Application Properties +
Property Name | Default | Meaning
---|---|---
spark.driver.bindAddress | (value of spark.driver.host) | Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems.
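As an illustration only (not part of this patch), the advertised and bind addresses could be set together through SparkConf; the host values below are placeholders:

```scala
// Illustrative sketch; the host values are placeholders, not taken from this patch.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.driver.host", "driver.example.com")  // address advertised to executors and external systems
  .set("spark.driver.bindAddress", "0.0.0.0")      // local address the driver's listening sockets bind to
```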
Property Name | Default | Meaning
---|---|---
spark.ssl.enabled | false | Whether to enable SSL connections on all supported protocols. When spark.ssl.enabled is configured, spark.ssl.protocol is required.
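A minimal sketch of these two properties set together; a real deployment also needs key store and trust store settings, which are omitted here:

```scala
// Minimal sketch; key store / trust store settings required by a real deployment are omitted.
import org.apache.spark.SparkConf

val sslConf = new SparkConf()
  .set("spark.ssl.enabled", "true")
  .set("spark.ssl.protocol", "TLSv1.2")  // assumption: any protocol name supported by the JVM
```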
The `SET -v` command will show the entire list of the SQL configuration.
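For example, from a Scala session with an active SparkSession named `spark` (an assumption here), the full listing can be printed with:

```scala
// Prints every SQL configuration property with its default value and description.
spark.sql("SET -v").show(numRows = 200, truncate = false)
```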
/applications/[app-id]/stages/[stage-id]/[stage-attempt-id]
/applications/[app-id]/stages/[stage-id]/[stage-attempt-id]/taskSummary
/applications/[base-app-id]/[attempt-id]/logs
/applications/[app-id]/streaming/statistics
/applications/[app-id]/streaming/receivers
/applications/[app-id]/streaming/receivers/[stream-id]
/applications/[app-id]/streaming/batches
/applications/[app-id]/streaming/batches/[batch-id]
/applications/[app-id]/streaming/batches/[batch-id]/operations
/applications/[app-id]/streaming/batches/[batch-id]/operations/[outputOp-id]
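As a hedged example, any of the endpoints above can be fetched as JSON over HTTP. The host, port, and application ID below are placeholders; the /api/v1 prefix and the default history server port 18080 are the usual values, but verify them for your deployment:

```scala
// Placeholder host, port and application id; adjust for your deployment.
import scala.io.Source

val url = "http://localhost:18080/api/v1/applications/app-20170101000000-0000/streaming/statistics"
val json = Source.fromURL(url).mkString
println(json)
```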
Property Name | Meaning
---|---
fileFormat | A fileFormat is a kind of package of storage format specifications, including "serde", "input format" and "output format". Currently we support 6 fileFormats: 'sequencefile', 'rcfile', 'orc', 'parquet', 'textfile' and 'avro'.
inputFormat, outputFormat | These two options specify the name of a corresponding `InputFormat` and `OutputFormat` class as a string literal, e.g. `org.apache.hadoop.hive.ql.io.orc.OrcInputFormat`. These two options must appear as a pair, and you cannot specify them if you have already specified the `fileFormat` option.
serde | This option specifies the name of a serde class. When the `fileFormat` option is specified, do not specify this option if the given `fileFormat` already includes the serde information. Currently "sequencefile", "textfile" and "rcfile" don't include the serde information, so you can use this option with these 3 fileFormats.
fieldDelim, escapeDelim, collectionDelim, mapkeyDelim, lineDelim | These options can only be used with the "textfile" fileFormat. They define how to read delimited files into rows.
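A short sketch of how these options are typically passed when creating a Hive table, assuming a SparkSession named `spark` built with Hive support:

```scala
// Creates a Hive-serde table whose storage format is chosen via the fileFormat option above.
spark.sql(
  "CREATE TABLE hive_records(key INT, value STRING) USING hive OPTIONS(fileFormat 'parquet')")
```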
Source | Options | Fault-tolerant | Notes
---|---|---|---
File source | path: path to the input directory, and common to all file formats. For file-format-specific options, see the related methods in DataStreamReader (Scala/Java/Python), e.g. for the "parquet" format see DataStreamReader.parquet(). | Yes | Supports glob paths, but does not support multiple comma-separated paths/globs.
Socket Source | host: host to connect to, must be specified. port: port to connect to, must be specified. | No |
Kafka Source | See the Kafka Integration Guide. | Yes |
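For instance, the socket source above could be read as follows, assuming a SparkSession named `spark`; the host and port are placeholders:

```scala
// Reads a stream of text lines from a TCP socket; host and port are placeholder values.
val lines = spark.readStream
  .format("socket")
  .option("host", "localhost")
  .option("port", 9999)
  .load()
```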
Query Type | | Supported Output Modes | Notes
---|---|---|---
Queries without aggregation | | Append | Complete mode not supported as it is infeasible to keep all data in the Result Table.
Queries with aggregation | Aggregation on event-time with watermark | Append, Update, Complete | Append mode uses watermark to drop old aggregation state. But the output of a windowed aggregation is delayed by the late threshold specified in `withWatermark()`, since by the mode's semantics rows can be added to the Result Table only once after they are finalized (i.e. after the watermark is crossed). See the Late Data section for more details. Update mode uses watermark to drop old aggregation state. Complete mode does not drop old aggregation state since by definition this mode preserves all data in the Result Table.
Queries with aggregation | Other aggregations | Complete, Update | Since no watermark is defined (it is only defined in the other category), old aggregation state is not dropped. Append mode is not supported as aggregates can update, thus violating the semantics of this mode.
Sink | Supported Output Modes | Options | Fault-tolerant | Notes
---|---|---|---|---
File Sink | Append | path: path to the output directory, must be specified. maxFilesPerTrigger: maximum number of new files to be considered in every trigger (default: no max). latestFirst: whether to process the latest new files first, useful when there is a large backlog of files (default: false). For file-format-specific options, see the related methods in DataFrameWriter (Scala/Java/Python), e.g. for the "parquet" format see DataFrameWriter.parquet(). | Yes | Supports writes to partitioned tables. Partitioning by time may be useful.
Foreach Sink | Append, Update, Complete | None | Depends on ForeachWriter implementation | More details in the next section
Console Sink | Append, Update, Complete | numRows: number of rows to print every trigger (default: 20). truncate: whether to truncate the output if too long (default: true). | No |
Memory Sink | Append, Complete | None | No. But in Complete Mode, restarted query will recreate the full table. | Table name is the query name.
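A sketch tying the tables above together: a windowed aggregation with a watermark, written to the console sink in the newly supported Update mode. The `events` DataFrame and its `timestamp` column are assumptions, standing in for any of the streaming sources listed earlier:

```scala
// Sketch only: `events` is assumed to be a streaming DataFrame with a `timestamp` column.
import org.apache.spark.sql.functions.window

val windowedCounts = events
  .withWatermark("timestamp", "10 minutes")
  .groupBy(window(events("timestamp"), "5 minutes"))
  .count()

val query = windowedCounts.writeStream
  .outputMode("update")        // one of the output modes listed for the console sink above
  .format("console")
  .option("numRows", "50")     // console sink options from the table above
  .option("truncate", "false")
  .start()

query.awaitTermination()
```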
@@ -1007,7 +1121,7 @@ Here is a table of all the sinks, and the corresponding settings. |