diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a8f5edf26d2b3..cee0b2a2bd7b2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -493,7 +493,8 @@ jobs: - name: Install dependencies for documentation generation run: | # pandoc is required to generate PySpark APIs as well in nbsphinx. - apt-get install -y libcurl4-openssl-dev pandoc + apt-get install -y libcurl4-openssl-dev pandoc libfontconfig1-dev libharfbuzz-dev \ + libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. # See also https://github.com/sphinx-doc/sphinx/issues/7551. # Jinja2 3.0.0+ causes error when building with Sphinx. @@ -503,6 +504,8 @@ jobs: apt-get update -y apt-get install -y ruby ruby-dev Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" + Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" + Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" gem install bundler cd docs bundle install diff --git a/R/create-docs.sh b/R/create-docs.sh index ce0fb48b9ff27..1774d5870de5a 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -24,6 +24,8 @@ # $SPARK_HOME/R/pkg/html # The vignettes can be found in # $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html +# pkgdown website can be found in +# $SPARK_HOME/R/pkg/docs set -o pipefail set -e @@ -51,6 +53,18 @@ pushd pkg/html "$R_SCRIPT_PATH/Rscript" -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); knitr::knit_rd("SparkR", links = tools::findHTMLlinks(file.path(libDir, "SparkR")))' + +# Determine Spark(R) version +SPARK_VERSION=$(grep -oP "(?<=Version:\ ).*" ../DESCRIPTION) + +# Update url +sed "s/{SPARK_VERSION}/$SPARK_VERSION/" ../pkgdown/_pkgdown_template.yml > ../_pkgdown.yml + +"$R_SCRIPT_PATH/Rscript" -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); pkgdown::build_site("..")' + +# Clean temporary config +rm ../_pkgdown.yml + popd popd diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore index 18b2db69db8f1..2bfcda66f3e0d 100644 --- a/R/pkg/.Rbuildignore +++ b/R/pkg/.Rbuildignore @@ -7,3 +7,6 @@ ^src-native$ ^html$ ^tests/fulltests/* +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/R/pkg/.gitignore b/R/pkg/.gitignore new file mode 100644 index 0000000000000..d8f8d46921aa8 --- /dev/null +++ b/R/pkg/.gitignore @@ -0,0 +1 @@ +docs diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index e37509ad488c0..db616626f8fc8 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -890,10 +890,9 @@ setMethod("toJSON", #' save mode (it is 'error' by default) #' @param ... additional argument(s) passed to the method. #' You can find the JSON-specific options for writing JSON files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{ -#' Data Source Option} in the version you use. -#' +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @family SparkDataFrame functions #' @rdname write.json #' @name write.json @@ -925,10 +924,9 @@ setMethod("write.json", #' save mode (it is 'error' by default) #' @param ... additional argument(s) passed to the method. 
#' You can find the ORC-specific options for writing ORC files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{ -#' Data Source Option} in the version you use. -#' +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @family SparkDataFrame functions #' @aliases write.orc,SparkDataFrame,character-method #' @rdname write.orc @@ -960,10 +958,9 @@ setMethod("write.orc", #' save mode (it is 'error' by default) #' @param ... additional argument(s) passed to the method. #' You can find the Parquet-specific options for writing Parquet files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option -#' }{Data Source Option} in the version you use. -#' +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @family SparkDataFrame functions #' @rdname write.parquet #' @name write.parquet @@ -996,10 +993,9 @@ setMethod("write.parquet", #' save mode (it is 'error' by default) #' @param ... additional argument(s) passed to the method. #' You can find the text-specific options for writing text files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{ -#' Data Source Option} in the version you use. -#' +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @family SparkDataFrame functions #' @aliases write.text,SparkDataFrame,character-method #' @rdname write.text @@ -3912,8 +3908,7 @@ setMethod("isStreaming", #' @aliases write.stream,SparkDataFrame-method #' @rdname write.stream #' @name write.stream -#' @examples -#'\dontrun{ +#' @examples \dontrun{ #' sparkR.session() #' df <- read.stream("socket", host = "localhost", port = 9999) #' isStreaming(df) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index d86e38bf4b3b8..5adebade8b7eb 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -382,9 +382,9 @@ setMethod("toDF", signature(x = "RDD"), #' @param path Path of file to read. A vector of multiple paths is allowed. #' @param ... additional external data source specific named properties. #' You can find the JSON-specific options for reading JSON files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{ -#' Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @return SparkDataFrame #' @rdname read.json #' @examples @@ -414,9 +414,9 @@ read.json <- function(path, ...) { #' @param path Path of file to read. #' @param ... additional external data source specific named properties. #' You can find the ORC-specific options for reading ORC files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{ -#' Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @return SparkDataFrame #' @rdname read.orc #' @name read.orc @@ -439,9 +439,9 @@ read.orc <- function(path, ...) 
{ #' @param path path of file to read. A vector of multiple paths is allowed. #' @param ... additional data source specific named properties. #' You can find the Parquet-specific options for reading Parquet files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option -#' }{Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @return SparkDataFrame #' @rdname read.parquet #' @name read.parquet @@ -468,9 +468,9 @@ read.parquet <- function(path, ...) { #' @param path Path of file to read. A vector of multiple paths is allowed. #' @param ... additional external data source specific named properties. #' You can find the text-specific options for reading text files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{ -#' Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' @return SparkDataFrame #' @rdname read.text #' @examples @@ -619,8 +619,9 @@ loadDF <- function(path = NULL, source = NULL, schema = NULL, ...) { #' #' Additional JDBC database connection properties can be set (...) #' You can find the JDBC-specific option and parameter documentation for reading tables via JDBC in -#' \url{https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option}{ -#' Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option}{Data Source Option} in the version you use. +# nolint end #' #' Only one of partitionColumn or predicates should be set. Partitions of the table will be #' retrieved in parallel based on the \code{numPartitions} or by the predicates. diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 48d4fe81c8c87..1377f0daa7360 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -264,18 +264,20 @@ NULL #' additional named properties to control how it is converted and accepts the #' same options as the JSON data source. #' You can find the JSON-specific options for reading/writing JSON files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{ -#' Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{Data Source Option} +# nolint end +#' in the version you use. #' \item \code{to_json}: it supports the "pretty" option which enables pretty #' JSON generation. #' \item \code{to_csv}, \code{from_csv} and \code{schema_of_csv}: this contains #' additional named properties to control how it is converted and accepts the #' same options as the CSV data source. #' You can find the CSV-specific options for reading/writing CSV files in -#' \url{ -#' https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option}{ -#' Data Source Option} in the version you use. +# nolint start +#' \url{https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option}{Data Source Option} +# nolint end +#' in the version you use. #' \item \code{arrays_zip}, this contains additional Columns of arrays to be merged. #' \item \code{map_concat}, this contains additional Columns of maps to be unioned. 
#' } @@ -3816,6 +3818,7 @@ setMethod("row_number", #' Column, for example \code{unresolved_named_lambda_var("a", "b", "c")} #' yields unresolved \code{a.b.c} #' @return Column object wrapping JVM UnresolvedNamedLambdaVariable +#' @keywords internal unresolved_named_lambda_var <- function(...) { jc <- newJObject( "org.apache.spark.sql.Column", @@ -3839,6 +3842,7 @@ unresolved_named_lambda_var <- function(...) { #' @param fun R \code{function} (unary, binary or ternary) #' that transforms \code{Columns} into a \code{Column} #' @return JVM \code{LambdaFunction} object +#' @keywords internal create_lambda <- function(fun) { as_jexpr <- function(x) callJMethod(x@jc, "expr") @@ -3887,6 +3891,7 @@ create_lambda <- function(fun) { #' @param cols list of character or Column objects #' @param funs list of named list(fun = ..., expected_narg = ...) #' @return a \code{Column} representing name applied to cols with funs +#' @keywords internal invoke_higher_order_function <- function(name, cols, funs) { as_jexpr <- function(x) { if (class(x) == "character") { diff --git a/R/pkg/R/jobj.R b/R/pkg/R/jobj.R index 4905e1fe5c61f..5c2cdddcfa36a 100644 --- a/R/pkg/R/jobj.R +++ b/R/pkg/R/jobj.R @@ -72,6 +72,7 @@ jobj <- function(objId) { #' @param x The JVM object reference #' @param ... further arguments passed to or from other methods #' @note print.jobj since 1.4.0 +#' @keywords internal print.jobj <- function(x, ...) { name <- getClassName.jobj(x) cat("Java ref type", name, "id", x$id, "\n", sep = " ") diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 7044ede0cc58b..4c83e2e450cb6 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -95,6 +95,7 @@ structType.character <- function(x, ...) { #' @param x A StructType object #' @param ... further arguments passed to or from other methods #' @note print.structType since 1.4.0 +#' @keywords internal print.structType <- function(x, ...) { cat("StructType\n", sapply(x$fields(), @@ -234,6 +235,7 @@ structField.character <- function(x, type, nullable = TRUE, ...) { #' @param x A StructField object #' @param ... further arguments passed to or from other methods #' @note print.structField since 1.4.0 +#' @keywords internal print.structField <- function(x, ...) { cat("StructField(name = \"", x$name(), "\", type = \"", x$dataType.toString(), diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 264cbfc9ba929..ca8f8defdfdec 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -115,6 +115,7 @@ isRDD <- function(name, env) { #' hashCode("1") # 49 #'} #' @note hashCode since 1.4.0 +#' @keywords internal hashCode <- function(key) { if (class(key) == "integer") { as.integer(key[[1]]) diff --git a/R/README.md b/R/pkg/README.md similarity index 100% rename from R/README.md rename to R/pkg/README.md diff --git a/R/pkg/pkgdown/_pkgdown_template.yml b/R/pkg/pkgdown/_pkgdown_template.yml new file mode 100644 index 0000000000000..674606f5b5cdf --- /dev/null +++ b/R/pkg/pkgdown/_pkgdown_template.yml @@ -0,0 +1,311 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +url: https://spark.apache.org/docs/{SPARK_VERSION}/api/R/ + +template: + package: preferably + bootstrap: 5 + params: + toggle: manual + includes: + before_title: | + + + + +authors: + Shivaram Venkataraman : + href: https://github.com/shivaram + Xiangrui Meng: + href: https://github.com/mengxr + Felix Cheung: + href: https://github.com/felixcheung + " The Apache Software Foundation": + href: "https://www.apache.org/" + + +reference: + +- title: "Distributed Data Frame" +- contents: + - SparkDataFrame-class + - GroupedData-class + - agg + - arrange + - approxQuantile + - as.data.frame + - attach,SparkDataFrame-method + - broadcast + - cache + - cacheTable + - checkpoint + - collect + - coltypes + - colnames + - count + - createDataFrame + - createExternalTable + - createOrReplaceTempView + - createTable + - crossJoin + - crosstab + - cube + - describe + - distinct + - dim + - drop + - dropDuplicates + - dropna + - dtypes + - except + - exceptAll + - explain + - filter + - getNumPartitions + - group_by + - head + - hint + - histogram + - insertInto + - intersect + - intersectAll + - isLocal + - isStreaming + - join + - limit + - localCheckpoint + - merge + - mutate + - ncol + - nrow + - orderBy + - persist + - pivot + - printSchema + - randomSplit + - rbind + - rename + - registerTempTable + - repartition + - repartitionByRange + - rollup + - sample + - sampleBy + - saveAsTable + - schema + - select + - selectExpr + - show + - showDF + - str + - storageLevel + - subset + - summary + - take + - tableToDF + - toJSON + - union + - unionAll + - unionByName + - unpersist + - with + - withColumn + +- title: "Data import and export" +- contents: + - read.df + - read.jdbc + - read.json + - read.orc + - read.parquet + - read.text + - write.df + - write.jdbc + - write.json + - write.orc + - write.parquet + - write.text + +- title: "Column functions" +- contents: + - column_aggregate_functions + - column_avro_functions + - column_collection_functions + - column_datetime_diff_functions + - column_math_functions + - column_misc_functions + - column_ml_functions + - column_nonaggregate_functions + - column_string_functions + - column_window_functions + - alias + - asc + - avg + - between + - cast + - column + - coalesce + - corr + - cov + - dropFields + - endsWith + - first + - last + - not + - otherwise + - startsWith + - substr + - timestamp_seconds + - withField + - over + - predict + - partitionBy + - rangeBetween + - rowsBetween + - windowOrderBy + - windowPartitionBy + - WindowSpec-class + - "%in%" + - "%<=>%" + +- title: "Schema Definitions" +- contents: + - structField + - structType + +- title: "Structured Streaming" +- contents: + - StreamingQuery-class + - awaitTermination + - isActive + - queryName + - lastProgress + - read.stream + - status + - stopQuery + - withWatermark + - write.stream + +- title: "Spark MLlib" + desc: "MLlib is Spark’s machine learning (ML) library" +- contents: + - AFTSurvivalRegressionModel-class + - ALSModel-class + - BisectingKMeansModel-class + - DecisionTreeClassificationModel-class + - DecisionTreeRegressionModel-class + - 
FMClassificationModel-class + - FMRegressionModel-class + - FPGrowthModel-class + - GBTClassificationModel-class + - GBTRegressionModel-class + - GaussianMixtureModel-class + - GeneralizedLinearRegressionModel-class + - glm,formula,ANY,SparkDataFrame-method + - IsotonicRegressionModel-class + - KMeansModel-class + - KSTest-class + - LDAModel-class + - LinearRegressionModel-class + - LinearSVCModel-class + - LogisticRegressionModel-class + - MultilayerPerceptronClassificationModel-class + - NaiveBayesModel-class + - PowerIterationClustering-class + - PrefixSpan-class + - RandomForestClassificationModel-class + - RandomForestRegressionModel-class + - fitted + - freqItems + - spark.als + - spark.bisectingKmeans + - spark.decisionTree + - spark.fmClassifier + - spark.fmRegressor + - spark.fpGrowth + - spark.gaussianMixture + - spark.gbt + - spark.glm + - spark.isoreg + - spark.kmeans + - spark.kstest + - spark.lda + - spark.lm + - spark.logit + - spark.mlp + - spark.naiveBayes + - spark.assignClusters + - spark.findFrequentSequentialPatterns + - spark.randomForest + - spark.survreg + - spark.svmLinear + - read.ml + - write.ml + +- title: "Distributed R" +- contents: + - dapply + - dapplyCollect + - gapply + - gapplyCollect + - spark.lapply + +- title: "SQL Catalog" +- contents: + - currentDatabase + - dropTempTable + - dropTempView + - listColumns + - listDatabases + - listFunctions + - listTables + - refreshByPath + - refreshTable + - recoverPartitions + - tableNames + - tables + - uncacheTable + +- title: "Spark Session and Context" +- contents: + - cancelJobGroup + - clearCache + - clearJobGroup + - getLocalProperty + - install.spark + - setCheckpointDir + - setCurrentDatabase + - setJobDescription + - setJobGroup + - setLocalProperty + - setLogLevel + - spark.addFile + - spark.getSparkFiles + - spark.getSparkFilesRootDirectory + - sparkR.conf + - sparkR.callJMethod + - sparkR.callJStatic + - sparkR.init + - sparkR.newJObject + - sparkR.session + - sparkR.session.stop + - sparkR.uiWebUrl + - sparkR.version + - sparkRHive.init + - sparkRSQL.init + - sql diff --git a/R/pkg/pkgdown/extra.css b/R/pkg/pkgdown/extra.css new file mode 100644 index 0000000000000..997789bf25ccb --- /dev/null +++ b/R/pkg/pkgdown/extra.css @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +body, p, a { + font-family: -apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji; + font-weight: 400; +} + +h1 { + margin-top: 3rem !important; + font-size: 2.25rem !important; +} + +h2 { + font-size: 2rem !important; +} + +aside h2 { + margin-top: 2rem !important; + font-size: 1.25rem !important; +} + +h3 { + font-size: 1.75rem !important; +} + +.navbar-brand { + padding-top: .3125rem; + padding-bottom: .3125rem; + margin-right: 1rem; + font-size: 1.25rem; + text-decoration: none; + white-space: nowrap; +} diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 0ed0028eb5173..1f3dd13353ffe 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -27,7 +27,15 @@ vignette: > limitations under the License. --> -```{r setup, include=FALSE} +```{r dynamic-chunk-options, include=FALSE} +# In GitHub lint job, we don't have full JVM build +# SparkR vignette fails to evaluate +GITHUB_ACTIONS <- tolower(Sys.getenv("GITHUB_ACTIONS")) == "true" +EVAL_CHUNK <- !GITHUB_ACTIONS +``` + + +```{r setup, include=FALSE, eval=EVAL_CHUNK} library(knitr) opts_hooks$set(eval = function(options) { # override eval to FALSE only on windows @@ -53,11 +61,11 @@ SparkR is an R package that provides a light-weight frontend to use Apache Spark We begin with an example running on the local machine and provide an overview of the use of SparkR: data ingestion, data processing and machine learning. First, let's load and attach the package. -```{r, message=FALSE} +```{r, message=FALSE, eval=EVAL_CHUNK} library(SparkR) ``` -```{r, include=FALSE} +```{r, include=FALSE, eval=EVAL_CHUNK} # disable eval if java version not supported override_eval <- tryCatch(!is.numeric(SparkR:::checkJavaVersion()), error = function(e) { TRUE }, @@ -75,11 +83,12 @@ if (override_eval) { We use default settings in which it runs in local mode. It auto downloads Spark package in the background if no previous installation is found. For more details about setup, see [Spark Session](#SetupSparkSession). -```{r, include=FALSE} +```{r, include=FALSE, eval=EVAL_CHUNK} install.spark() sparkR.session(master = "local[1]", sparkConfig = sparkSessionConfig, enableHiveSupport = FALSE) ``` -```{r, eval=FALSE} + +```{r, eval=EVAL_CHUNK} sparkR.session() ``` @@ -87,18 +96,18 @@ The operations in SparkR are centered around an R class called `SparkDataFrame`. `SparkDataFrame` can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing local R data frames. For example, we create a `SparkDataFrame` from a local R data frame, -```{r} +```{r, eval=EVAL_CHUNK} cars <- cbind(model = rownames(mtcars), mtcars) carsDF <- createDataFrame(cars) ``` We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` function. -```{r} +```{r, eval=EVAL_CHUNK} head(carsDF) ``` Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`. -```{r} +```{r, eval=EVAL_CHUNK} carsSubDF <- select(carsDF, "model", "mpg", "hp") carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200) head(carsSubDF) @@ -106,13 +115,13 @@ head(carsSubDF) SparkR can use many common aggregation functions after grouping. -```{r} +```{r, eval=EVAL_CHUNK} carsGPDF <- summarize(groupBy(carsDF, carsDF$gear), count = n(carsDF$gear)) head(carsGPDF) ``` The results `carsDF` and `carsSubDF` are `SparkDataFrame` objects. 
To convert back to R `data.frame`, we can use `collect`. **Caution**: This can cause your interactive environment to run out of memory, though, because `collect()` fetches the entire distributed `DataFrame` to your client, which is acting as a Spark driver. -```{r} +```{r, eval=EVAL_CHUNK} carsGP <- collect(carsGPDF) class(carsGP) ``` @@ -120,13 +129,13 @@ class(carsGP) SparkR supports a number of commonly used machine learning algorithms. Under the hood, SparkR uses MLlib to train the model. Users can call `summary` to print a summary of the fitted model, `predict` to make predictions on new data, and `write.ml`/`read.ml` to save/load fitted models. SparkR supports a subset of R formula operators for model fitting, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-‘. We use linear regression as an example. -```{r} +```{r, eval=EVAL_CHUNK} model <- spark.glm(carsDF, mpg ~ wt + cyl) ``` The result matches that returned by R `glm` function applied to the corresponding `data.frame` `mtcars` of `carsDF`. In fact, for Generalized Linear Model, we specifically expose `glm` for `SparkDataFrame` as well so that the above is equivalent to `model <- glm(mpg ~ wt + cyl, data = carsDF)`. -```{r} +```{r, eval=EVAL_CHUNK} summary(model) ``` @@ -186,7 +195,7 @@ sparkR.session(spark.sql.warehouse.dir = spark_warehouse_path) SparkR can connect to remote Spark clusters. [Cluster Mode Overview](https://spark.apache.org/docs/latest/cluster-overview.html) is a good introduction to different Spark cluster modes. When connecting SparkR to a remote Spark cluster, make sure that the Spark version and Hadoop version on the machine match the corresponding versions on the cluster. Current SparkR package is compatible with -```{r, echo=FALSE, tidy = TRUE} +```{r, echo=FALSE, tidy = TRUE, eval=EVAL_CHUNK} paste("Spark", packageVersion("SparkR")) ``` It should be used both on the local computer and on the remote cluster. @@ -208,7 +217,7 @@ Yarn cluster mode is not supported in the current version. ### Local Data Frame The simplest way is to convert a local R data frame into a `SparkDataFrame`. Specifically we can use `as.DataFrame` or `createDataFrame` and pass in the local R data frame to create a `SparkDataFrame`. As an example, the following creates a `SparkDataFrame` based using the `faithful` dataset from R. -```{r} +```{r, eval=EVAL_CHUNK} df <- as.DataFrame(faithful) head(df) ``` @@ -231,7 +240,7 @@ The data sources API natively supports JSON formatted input files. Note that the Let's take a look at the first two lines of the raw JSON file used here. -```{r} +```{r, eval=EVAL_CHUNK} filePath <- paste0(sparkR.conf("spark.home"), "/examples/src/main/resources/people.json") readLines(filePath, n = 2L) @@ -239,19 +248,19 @@ readLines(filePath, n = 2L) We use `read.df` to read that into a `SparkDataFrame`. -```{r} +```{r, eval=EVAL_CHUNK} people <- read.df(filePath, "json") count(people) head(people) ``` SparkR automatically infers the schema from the JSON file. -```{r} +```{r, eval=EVAL_CHUNK} printSchema(people) ``` If we want to read multiple JSON files, `read.json` can be used. -```{r} +```{r, eval=EVAL_CHUNK} people <- read.json(paste0(Sys.getenv("SPARK_HOME"), c("/examples/src/main/resources/people.json", "/examples/src/main/resources/people.json"))) @@ -298,12 +307,12 @@ dplyr | SparkR Other differences will be mentioned in the specific methods. We use the `SparkDataFrame` `carsDF` created above. We can get basic information about the `SparkDataFrame`. 
-```{r} +```{r, eval=EVAL_CHUNK} carsDF ``` Print out the schema in tree format. -```{r} +```{r, eval=EVAL_CHUNK} printSchema(carsDF) ``` @@ -314,12 +323,12 @@ printSchema(carsDF) SparkDataFrames support a number of functions to do structured data processing. Here we include some basic examples and a complete list can be found in the [API](https://spark.apache.org/docs/latest/api/R/index.html) docs: You can also pass in column name as strings. -```{r} +```{r, eval=EVAL_CHUNK} head(select(carsDF, "mpg")) ``` Filter the SparkDataFrame to only retain rows with mpg less than 20 miles/gallon. -```{r} +```{r, eval=EVAL_CHUNK} head(filter(carsDF, carsDF$mpg < 20)) ``` @@ -335,20 +344,20 @@ A number of widely used functions are supported to aggregate data after grouping For example we can compute a histogram of the number of cylinders in the `mtcars` dataset as shown below. -```{r} +```{r, eval=EVAL_CHUNK} numCyl <- summarize(groupBy(carsDF, carsDF$cyl), count = n(carsDF$cyl)) head(numCyl) ``` Use `cube` or `rollup` to compute subtotals across multiple dimensions. -```{r} +```{r, eval=EVAL_CHUNK} mean(cube(carsDF, "cyl", "gear", "am"), "mpg") ``` generates groupings for {(`cyl`, `gear`, `am`), (`cyl`, `gear`), (`cyl`), ()}, while -```{r} +```{r, eval=EVAL_CHUNK} mean(rollup(carsDF, "cyl", "gear", "am"), "mpg") ``` @@ -359,7 +368,7 @@ generates groupings for all possible combinations of grouping columns. SparkR also provides a number of functions that can directly applied to columns for data processing and during aggregation. The example below shows the use of basic arithmetic functions. -```{r} +```{r, eval=EVAL_CHUNK} carsDF_km <- carsDF carsDF_km$kmpg <- carsDF_km$mpg * 1.61 head(select(carsDF_km, "model", "mpg", "kmpg")) @@ -378,7 +387,7 @@ Formally, the *group* mentioned above is called the *frame*. Every input row can Window functions are often used in conjunction with the following functions: `windowPartitionBy`, `windowOrderBy`, `partitionBy`, `orderBy`, `over`. To illustrate this we next look at an example. We still use the `mtcars` dataset. The corresponding `SparkDataFrame` is `carsDF`. Suppose for each number of cylinders, we want to calculate the rank of each car in `mpg` within the group. -```{r} +```{r, eval=EVAL_CHUNK} carsSubDF <- select(carsDF, "model", "mpg", "cyl") ws <- orderBy(windowPartitionBy("cyl"), "mpg") carsRank <- withColumn(carsSubDF, "rank", over(rank(), ws)) @@ -403,7 +412,7 @@ In SparkR, we support several kinds of user-defined functions (UDFs). We convert `mpg` to `kmpg` (kilometers per gallon). `carsSubDF` is a `SparkDataFrame` with a subset of `carsDF` columns. -```{r} +```{r, eval=EVAL_CHUNK} carsSubDF <- select(carsDF, "model", "mpg") schema <- "model STRING, mpg DOUBLE, kmpg DOUBLE" out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema) @@ -412,7 +421,7 @@ head(collect(out)) Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. -```{r} +```{r, eval=EVAL_CHUNK} out <- dapplyCollect( carsSubDF, function(x) { @@ -424,7 +433,7 @@ head(out, 3) #### Apply by Group `gapply` can apply a function to each group of a `SparkDataFrame`. 
The function is to be applied to each group of the `SparkDataFrame` and should have only two parameters: grouping key and R `data.frame` corresponding to that key. The groups are chosen from `SparkDataFrames` column(s). The output of function should be a `data.frame`. Schema specifies the row format of the resulting `SparkDataFrame`. It must represent R function’s output schema on the basis of Spark data types. The column names of the returned `data.frame` are set by user. See [here](#DataTypes) for mapping between R and Spark. -```{r} +```{r, eval=EVAL_CHUNK} schema <- structType(structField("cyl", "double"), structField("max_mpg", "double")) result <- gapply( carsDF, @@ -438,7 +447,7 @@ head(arrange(result, "max_mpg", decreasing = TRUE)) Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. -```{r} +```{r, eval=EVAL_CHUNK} result <- gapplyCollect( carsDF, "cyl", @@ -456,7 +465,7 @@ Similar to `lapply` in native R, `spark.lapply` runs a function over a list of e We use `svm` in package `e1071` as an example. We use all default settings except for varying costs of constraints violation. `spark.lapply` can train those different models in parallel. -```{r} +```{r, eval=EVAL_CHUNK} costs <- exp(seq(from = log(1), to = log(1000), length.out = 5)) train <- function(cost) { stopifnot(requireNamespace("e1071", quietly = TRUE)) @@ -466,24 +475,24 @@ train <- function(cost) { ``` Return a list of model's summaries. -```{r} +```{r, eval=EVAL_CHUNK} model.summaries <- spark.lapply(costs, train) ``` -```{r} +```{r, eval=EVAL_CHUNK} class(model.summaries) ``` To avoid lengthy display, we only present the partial result of the second fitted model. You are free to inspect other models as well. -```{r, include=FALSE} +```{r, include=FALSE, eval=EVAL_CHUNK} ops <- options() options(max.print=40) ``` -```{r} +```{r, eval=EVAL_CHUNK} print(model.summaries[[2]]) ``` -```{r, include=FALSE} +```{r, include=FALSE, eval=EVAL_CHUNK} options(ops) ``` @@ -491,19 +500,19 @@ options(ops) ### SQL Queries A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. -```{r} +```{r, eval=EVAL_CHUNK} people <- read.df(paste0(sparkR.conf("spark.home"), "/examples/src/main/resources/people.json"), "json") ``` Register this `SparkDataFrame` as a temporary view. -```{r} +```{r, eval=EVAL_CHUNK} createOrReplaceTempView(people, "people") ``` SQL statements can be run using the sql method. -```{r} +```{r, eval=EVAL_CHUNK} teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") head(teenagers) ``` @@ -577,18 +586,18 @@ For most above, SparkR supports **R formula operators**, including `~`, `.`, `:` ### Training and Test Sets We can easily split `SparkDataFrame` into random training and test sets by the `randomSplit` function. It returns a list of split `SparkDataFrames` with provided `weights`. We use `carsDF` as an example and want to have about $70%$ training data and $30%$ test data. 
-```{r} +```{r, eval=EVAL_CHUNK} splitDF_list <- randomSplit(carsDF, c(0.7, 0.3), seed = 0) carsDF_train <- splitDF_list[[1]] carsDF_test <- splitDF_list[[2]] ``` -```{r} +```{r, eval=EVAL_CHUNK} count(carsDF_train) head(carsDF_train) ``` -```{r} +```{r, eval=EVAL_CHUNK} count(carsDF_test) head(carsDF_test) ``` @@ -601,7 +610,7 @@ head(carsDF_test) This is a binary classifier. We use a simple example to show how to use `spark.svmLinear` for binary classification. -```{r} +```{r, eval=EVAL_CHUNK} # load training data and create a DataFrame t <- as.data.frame(Titanic) training <- createDataFrame(t) @@ -611,7 +620,7 @@ summary(model) ``` Predict values on training data -```{r} +```{r, eval=EVAL_CHUNK} prediction <- predict(model, training) head(select(prediction, "Class", "Sex", "Age", "Freq", "Survived", "prediction")) ``` @@ -627,7 +636,7 @@ We use a simple example to demonstrate `spark.logit` usage. In general, there ar and 3). Obtain the coefficient matrix of the fitted model using `summary` and use the model for prediction with `predict`. Binomial logistic regression -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) training <- createDataFrame(t) model <- spark.logit(training, Survived ~ ., regParam = 0.04741301) @@ -635,13 +644,13 @@ summary(model) ``` Predict values on training data -```{r} +```{r, eval=EVAL_CHUNK} fitted <- predict(model, training) head(select(fitted, "Class", "Sex", "Age", "Freq", "Survived", "prediction")) ``` Multinomial logistic regression against three classes -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) training <- createDataFrame(t) # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional. @@ -673,7 +682,7 @@ MLPC employs backpropagation for learning the model. We use the logistic loss fu `spark.mlp` requires at least two columns in `data`: one named `"label"` and the other one `"features"`. The `"features"` column should be in libSVM-format. We use Titanic data set to show how to use `spark.mlp` in classification. -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) training <- createDataFrame(t) # fit a Multilayer Perceptron Classification Model @@ -681,18 +690,18 @@ model <- spark.mlp(training, Survived ~ Age + Sex, blockSize = 128, layers = c(2 ``` To avoid lengthy display, we only present partial results of the model summary. You can check the full result from your sparkR shell. -```{r, include=FALSE} +```{r, include=FALSE, eval=EVAL_CHUNK} ops <- options() options(max.print=5) ``` -```{r} +```{r, eval=EVAL_CHUNK} # check the summary of the fitted model summary(model) ``` -```{r, include=FALSE} +```{r, include=FALSE, eval=EVAL_CHUNK} options(ops) ``` -```{r} +```{r, eval=EVAL_CHUNK} # make predictions use the fitted model predictions <- predict(model, training) head(select(predictions, predictions$prediction)) @@ -702,7 +711,7 @@ head(select(predictions, predictions$prediction)) Naive Bayes model assumes independence among the features. `spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. -```{r} +```{r, eval=EVAL_CHUNK} titanic <- as.data.frame(Titanic) titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) naiveBayesModel <- spark.naiveBayes(titanicDF, Survived ~ Class + Sex + Age) @@ -718,7 +727,7 @@ Factorization Machines for classification problems. 
For background and details about the implementation of factorization machines, refer to the [Factorization Machines section](https://spark.apache.org/docs/latest/ml-classification-regression.html#factorization-machines). -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) training <- createDataFrame(t) @@ -735,7 +744,7 @@ Survival analysis studies the expected duration of time until an event happens, Accelerated Failure Time (AFT) model is a parametric survival model for censored data that assumes the effect of a covariate is to accelerate or decelerate the life course of an event by some constant. For more information, refer to the Wikipedia page [AFT Model](https://en.wikipedia.org/wiki/Accelerated_failure_time_model) and the references there. Different from a [Proportional Hazards Model](https://en.wikipedia.org/wiki/Proportional_hazards_model) designed for the same purpose, the AFT model is easier to parallelize because each instance contributes to the objective function independently. -```{r, warning=FALSE} +```{r, warning=FALSE, eval=EVAL_CHUNK} library(survival) ovarianDF <- createDataFrame(ovarian) aftModel <- spark.survreg(ovarianDF, Surv(futime, fustat) ~ ecog_ps + rx) @@ -772,23 +781,23 @@ For more information regarding the families and their link functions, see the Wi We use the `mtcars` dataset as an illustration. The corresponding `SparkDataFrame` is `carsDF`. After fitting the model, we print out a summary and see the fitted values by making predictions on the original dataset. We can also pass into a new `SparkDataFrame` of same schema to predict on new data. -```{r} +```{r, eval=EVAL_CHUNK} gaussianGLM <- spark.glm(carsDF, mpg ~ wt + hp) summary(gaussianGLM) ``` When doing prediction, a new column called `prediction` will be appended. Let's look at only a subset of columns here. -```{r} +```{r, eval=EVAL_CHUNK} gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` The following is the same fit using the tweedie family: -```{r} +```{r, eval=EVAL_CHUNK} tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", var.power = 0.0) summary(tweedieGLM1) ``` We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link: -```{r} +```{r, eval=EVAL_CHUNK} tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", var.power = 1.2, link.power = 0.0) summary(tweedieGLM2) @@ -811,7 +820,7 @@ There are a few more arguments that may be useful. We use an artificial example to show the use. -```{r} +```{r, eval=EVAL_CHUNK} y <- c(3.0, 6.0, 8.0, 5.0, 7.0) x <- c(1.0, 2.0, 3.5, 3.0, 4.0) w <- rep(1.0, 5) @@ -832,7 +841,7 @@ In the prediction stage, based on the fitted monotone piecewise function, the ru For example, when the input is $3.2$, the two closest feature values are $3.0$ and $3.5$, then predicted value would be a linear interpolation between the predicted values at $3.0$ and $3.5$. -```{r} +```{r, eval=EVAL_CHUNK} newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) head(predict(isoregModel, newDF)) ``` @@ -841,7 +850,7 @@ head(predict(isoregModel, newDF)) Linear regression model. -```{r} +```{r, eval=EVAL_CHUNK} model <- spark.lm(carsDF, mpg ~ wt + hp) summary(model) @@ -856,7 +865,7 @@ Factorization Machines for regression problems. 
For background and details about the implementation of factorization machines, refer to the [Factorization Machines section](https://spark.apache.org/docs/latest/ml-classification-regression.html#factorization-machines). -```{r} +```{r, eval=EVAL_CHUNK} model <- spark.fmRegressor(carsDF, mpg ~ wt + hp) summary(model) predictions <- predict(model, carsDF) @@ -870,7 +879,7 @@ Users can call `summary` to get a summary of the fitted model, `predict` to make We use the `Titanic` dataset to train a decision tree and make predictions: -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) df <- createDataFrame(t) dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2) @@ -886,7 +895,7 @@ Users can call `summary` to get a summary of the fitted model, `predict` to make We use the `Titanic` dataset to train a gradient-boosted tree and make predictions: -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) df <- createDataFrame(t) gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2) @@ -902,7 +911,7 @@ Users can call `summary` to get a summary of the fitted model, `predict` to make In the following example, we use the `Titanic` dataset to train a random forest and make predictions: -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) df <- createDataFrame(t) rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2) @@ -915,7 +924,7 @@ head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction" `spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy. -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) training <- createDataFrame(t) model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4) @@ -929,7 +938,7 @@ head(select(fitted, "Class", "prediction")) `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. We use a simulated example to demonstrate the usage. -```{r} +```{r, eval=EVAL_CHUNK} X1 <- data.frame(V1 = rnorm(4), V2 = rnorm(4)) X2 <- data.frame(V1 = rnorm(6, 3), V2 = rnorm(6, 4)) data <- rbind(X1, X2) @@ -944,7 +953,7 @@ head(select(gmmFitted, "V1", "V2", "prediction")) `spark.kmeans` fits a $k$-means clustering model against a `SparkDataFrame`. As an unsupervised learning method, we don't need a response variable. Hence, the left hand side of the R formula should be left blank. The clustering is based only on the variables on the right hand side. -```{r} +```{r, eval=EVAL_CHUNK} kmeansModel <- spark.kmeans(carsDF, ~ mpg + hp + wt, k = 3) summary(kmeansModel) kmeansPredictions <- predict(kmeansModel, carsDF) @@ -976,7 +985,7 @@ Two more functions are provided for the fitted model. For more information, see the help document `?spark.lda`. Let's look an artificial example. 
-```{r} +```{r, eval=EVAL_CHUNK} corpus <- data.frame(features = c( "1 2 6 0 2 3 1 1 0 0 3", "1 3 0 1 3 0 0 2 0 0 1", @@ -995,12 +1004,12 @@ model <- spark.lda(data = corpusDF, k = 5, optimizer = "em") summary(model) ``` -```{r} +```{r, eval=EVAL_CHUNK} posterior <- spark.posterior(model, corpusDF) head(posterior) ``` -```{r} +```{r, eval=EVAL_CHUNK} perplexity <- spark.perplexity(model, corpusDF) perplexity ``` @@ -1038,7 +1047,7 @@ head(predicted) Power Iteration Clustering (PIC) is a scalable graph clustering algorithm. `spark.assignClusters` method runs the PIC algorithm and returns a cluster assignment for each input vertex. -```{r} +```{r, eval=EVAL_CHUNK} df <- createDataFrame(list(list(0L, 1L, 1.0), list(0L, 2L, 1.0), list(1L, 2L, 1.0), list(3L, 4L, 1.0), list(4L, 0L, 0.1)), @@ -1050,7 +1059,7 @@ head(spark.assignClusters(df, initMode = "degree", weightCol = "weight")) `spark.fpGrowth` executes FP-growth algorithm to mine frequent itemsets on a `SparkDataFrame`. `itemsCol` should be an array of values. -```{r} +```{r, eval=EVAL_CHUNK} df <- selectExpr(createDataFrame(data.frame(rawItems = c( "T,R,U", "T,S", "V,R", "R,U,T,V", "R,S", "V,S,U", "U,R", "S,T", "V,R", "V,U,S", "T,V,U", "R,V", "T,S", "T,S", "S,T", "S,U", "T,R", "V,R", "S,V", "T,S,U" @@ -1061,19 +1070,19 @@ fpm <- spark.fpGrowth(df, minSupport = 0.2, minConfidence = 0.5) `spark.freqItemsets` method can be used to retrieve a `SparkDataFrame` with the frequent itemsets. -```{r} +```{r, eval=EVAL_CHUNK} head(spark.freqItemsets(fpm)) ``` `spark.associationRules` returns a `SparkDataFrame` with the association rules. -```{r} +```{r, eval=EVAL_CHUNK} head(spark.associationRules(fpm)) ``` We can make predictions based on the `antecedent`. -```{r} +```{r, eval=EVAL_CHUNK} head(predict(fpm, df)) ``` @@ -1081,7 +1090,7 @@ head(predict(fpm, df)) `spark.findFrequentSequentialPatterns` method can be used to find the complete set of frequent sequential patterns in the input sequences of itemsets. -```{r} +```{r, eval=EVAL_CHUNK} df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))), list(list(list(1L), list(3L, 2L), list(1L, 2L))), list(list(list(1L, 2L), list(5L))), @@ -1101,7 +1110,7 @@ In the following example, we test whether the `Titanic` dataset's `Freq` column follows a normal distribution. We set the parameters of the normal distribution using the mean and standard deviation of the sample. -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) df <- createDataFrame(t) freqStats <- head(select(df, mean(df$Freq), sd(df$Freq))) @@ -1116,7 +1125,7 @@ testSummary ### Model Persistence The following example shows how to save/load an ML model in SparkR. -```{r} +```{r, eval=EVAL_CHUNK} t <- as.data.frame(Titanic) training <- createDataFrame(t) gaussianGLM <- spark.glm(training, Freq ~ Sex + Age, family = "gaussian") @@ -1276,10 +1285,10 @@ env | map * [SparkR: Scaling R Programs with Spark](https://people.csail.mit.edu/matei/papers/2016/sigmod_sparkr.pdf), Shivaram Venkataraman, Zongheng Yang, Davies Liu, Eric Liang, Hossein Falaki, Xiangrui Meng, Reynold Xin, Ali Ghodsi, Michael Franklin, Ion Stoica, and Matei Zaharia. SIGMOD 2016. June 2016. 
-```{r, echo=FALSE} +```{r, echo=FALSE, eval=EVAL_CHUNK} sparkR.session.stop() ``` -```{r cleanup, include=FALSE} +```{r cleanup, include=FALSE, eval=EVAL_CHUNK} SparkR:::uninstallDownloadedSpark() ``` diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 83752bd941d25..ffd60c07af0c4 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -80,8 +80,11 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ $APT_INSTALL r-base r-base-dev && \ $APT_INSTALL libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev && \ $APT_INSTALL texlive-latex-base texlive texlive-fonts-extra texinfo qpdf && \ + $APT_INSTALL libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev && \ Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" && \ Rscript -e "devtools::install_github('jimhester/lintr')" && \ + Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" && \ # Install tools needed to build the documentation. $APT_INSTALL ruby2.7 ruby2.7-dev && \ gem install --no-document $GEM_PKGS diff --git a/docs/README.md b/docs/README.md index d5d04c6d35a9a..5e9a187ea3ab6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -56,6 +56,8 @@ and install these libraries: ```sh $ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown"), repos="https://cloud.r-project.org/")' $ sudo Rscript -e 'devtools::install_version("roxygen2", version = "7.1.1", repos="https://cloud.r-project.org/")' +$ sudo Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" +$ sudo Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" ``` Note: Other versions of roxygen2 might work in SparkR documentation generation but `RoxygenNote` field in `$SPARK_HOME/R/pkg/DESCRIPTION` is 7.1.1, which is updated if the version is mismatched. diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index 35d5dc97c2506..28d5e0d82c93a 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -152,11 +152,8 @@ puts "Making directory api/R" mkdir_p "api/R" - puts "cp -r ../R/pkg/html/. api/R" - cp_r("../R/pkg/html/.", "api/R") - - puts "cp ../R/pkg/DESCRIPTION api" - cp("../R/pkg/DESCRIPTION", "api") + puts "cp -r ../R/pkg/docs/. api/R" + cp_r("../R/pkg/docs/.", "api/R") end if not (ENV['SKIP_SQLDOC'] == '1')
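
Taken together, the changes above wire pkgdown into the SparkR documentation build: CI and the release Dockerfile gain the extra system libraries plus pinned `pkgdown` 2.0.1 and `preferably` 0.4, `R/create-docs.sh` renders `_pkgdown.yml` from the new template and builds the site into `R/pkg/docs`, and `docs/_plugins/copy_api_dirs.rb` copies that site into `api/R` instead of the old `R/pkg/html` output. As a rough local sketch of the new flow (not part of the patch itself; it assumes SparkR has already been installed into `$SPARK_HOME/R/lib`, e.g. via `R/install-dev.sh`, and that the pinned pkgdown/preferably versions are available):

```sh
# Sketch of the pkgdown steps that R/create-docs.sh now performs,
# run from $SPARK_HOME/R/pkg with SparkR already built into ../lib.
cd "$SPARK_HOME/R/pkg"

# Render the pkgdown config from the template, filling in the SparkR version.
SPARK_VERSION=$(grep -oP "(?<=Version:\ ).*" DESCRIPTION)
sed "s/{SPARK_VERSION}/$SPARK_VERSION/" pkgdown/_pkgdown_template.yml > _pkgdown.yml

# Build the site into R/pkg/docs, which copy_api_dirs.rb later copies to docs/api/R.
Rscript -e 'library(SparkR, lib.loc = "../lib"); pkgdown::build_site(".")'

# Clean up the temporary config, as the script does.
rm _pkgdown.yml
```

`R/create-docs.sh` still generates the Rd-based HTML and the knitted vignette first; the pkgdown site is an additional output, and it is what the Jekyll build now publishes under `api/R`.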