[SPARK-37474][R][DOCS] Migrate SparkR docs to pkgdown
### What changes were proposed in this pull request?

This PR proposes migrating the SparkR API documentation to [pkgdown](https://pkgdown.r-lib.org/).

The result (not synced automatically) could look like this:

https://zero323.gitlab.io/sparkr-docs-experiments/

### Why are the changes needed?

To improve the overall experience of interacting with the SparkR docs:

- Arguably, much better looking than the simple R docs.
- Fully linked examples allow easy inspection of the functions they use.
- Adds functional search.
- For what it's worth, all samples can be copied and pasted.
- Dark mode.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Inspection of the built documentation pages.
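
For a quick local check, one way to open the generated pages is sketched below; it assumes the site has already been built into `R/pkg/docs` by `R/create-docs.sh`, and the path handling is illustrative rather than part of this PR.

```r
# Open the locally built pkgdown site in a browser.
# Assumes SPARK_HOME points at the Spark checkout and that
# R/create-docs.sh has already generated R/pkg/docs.
site_index <- file.path(Sys.getenv("SPARK_HOME"), "R", "pkg", "docs", "index.html")
utils::browseURL(site_index)
```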

Closes #34728 from zero323/SPARK-37474.

Authored-by: zero323 <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
zero323 authored and srowen committed Dec 10, 2021
1 parent 12d3517 commit 16d1c68
Showing 17 changed files with 523 additions and 127 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/build_and_test.yml
@@ -493,7 +493,8 @@ jobs:
- name: Install dependencies for documentation generation
run: |
# pandoc is required to generate PySpark APIs as well in nbsphinx.
apt-get install -y libcurl4-openssl-dev pandoc
apt-get install -y libcurl4-openssl-dev pandoc libfontconfig1-dev libharfbuzz-dev \
libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
# Jinja2 3.0.0+ causes error when building with Sphinx.
@@ -503,6 +504,8 @@ jobs:
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
gem install bundler
cd docs
bundle install
14 changes: 14 additions & 0 deletions R/create-docs.sh
@@ -24,6 +24,8 @@
# $SPARK_HOME/R/pkg/html
# The vignettes can be found in
# $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html
# pkgdown website can be found in
# $SPARK_HOME/R/pkg/docs

set -o pipefail
set -e
@@ -51,6 +53,18 @@ pushd pkg/html

"$R_SCRIPT_PATH/Rscript" -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); knitr::knit_rd("SparkR", links = tools::findHTMLlinks(file.path(libDir, "SparkR")))'


# Determine Spark(R) version
SPARK_VERSION=$(grep -oP "(?<=Version:\ ).*" ../DESCRIPTION)

# Update url
sed "s/{SPARK_VERSION}/$SPARK_VERSION/" ../pkgdown/_pkgdown_template.yml > ../_pkgdown.yml

"$R_SCRIPT_PATH/Rscript" -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); pkgdown::build_site("..")'

# Clean temporary config
rm ../_pkgdown.yml

popd

popd
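
The additions above amount to four steps: read the SparkR version from `DESCRIPTION`, substitute it for the `{SPARK_VERSION}` placeholder in `pkgdown/_pkgdown_template.yml`, build the site with pkgdown, and delete the temporary `_pkgdown.yml`. A rough R-only sketch of the same steps, assuming the working directory is `R/pkg` and SparkR is already installed into `R/lib`:

```r
# Illustrative R equivalent of the shell steps above (not part of the script).
version <- read.dcf("DESCRIPTION")[1, "Version"]

# Fill in the {SPARK_VERSION} placeholder, as the sed call does.
template <- readLines(file.path("pkgdown", "_pkgdown_template.yml"))
writeLines(gsub("{SPARK_VERSION}", version, template, fixed = TRUE),
           "_pkgdown.yml")

# Build the static site into docs/ against the installed SparkR package.
library(SparkR, lib.loc = "../lib")
pkgdown::build_site(".")

# Remove the generated config again, mirroring `rm ../_pkgdown.yml`.
unlink("_pkgdown.yml")
```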
3 changes: 3 additions & 0 deletions R/pkg/.Rbuildignore
@@ -7,3 +7,6 @@
^src-native$
^html$
^tests/fulltests/*
^_pkgdown\.yml$
^docs$
^pkgdown$
1 change: 1 addition & 0 deletions R/pkg/.gitignore
@@ -0,0 +1 @@
docs
31 changes: 13 additions & 18 deletions R/pkg/R/DataFrame.R
@@ -890,10 +890,9 @@ setMethod("toJSON",
#' save mode (it is 'error' by default)
#' @param ... additional argument(s) passed to the method.
#' You can find the JSON-specific options for writing JSON files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{
#' Data Source Option} in the version you use.
#'
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @family SparkDataFrame functions
#' @rdname write.json
#' @name write.json
@@ -925,10 +924,9 @@ setMethod("write.json",
#' save mode (it is 'error' by default)
#' @param ... additional argument(s) passed to the method.
#' You can find the ORC-specific options for writing ORC files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{
#' Data Source Option} in the version you use.
#'
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @family SparkDataFrame functions
#' @aliases write.orc,SparkDataFrame,character-method
#' @rdname write.orc
@@ -960,10 +958,9 @@ setMethod("write.orc",
#' save mode (it is 'error' by default)
#' @param ... additional argument(s) passed to the method.
#' You can find the Parquet-specific options for writing Parquet files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option
#' }{Data Source Option} in the version you use.
#'
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @family SparkDataFrame functions
#' @rdname write.parquet
#' @name write.parquet
@@ -996,10 +993,9 @@ setMethod("write.parquet",
#' save mode (it is 'error' by default)
#' @param ... additional argument(s) passed to the method.
#' You can find the text-specific options for writing text files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{
#' Data Source Option} in the version you use.
#'
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @family SparkDataFrame functions
#' @aliases write.text,SparkDataFrame,character-method
#' @rdname write.text
@@ -3912,8 +3908,7 @@ setMethod("isStreaming",
#' @aliases write.stream,SparkDataFrame-method
#' @rdname write.stream
#' @name write.stream
#' @examples
#'\dontrun{
#' @examples \dontrun{
#' sparkR.session()
#' df <- read.stream("socket", host = "localhost", port = 9999)
#' isStreaming(df)
29 changes: 15 additions & 14 deletions R/pkg/R/SQLContext.R
@@ -382,9 +382,9 @@ setMethod("toDF", signature(x = "RDD"),
#' @param path Path of file to read. A vector of multiple paths is allowed.
#' @param ... additional external data source specific named properties.
#' You can find the JSON-specific options for reading JSON files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{
#' Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @return SparkDataFrame
#' @rdname read.json
#' @examples
@@ -414,9 +414,9 @@ read.json <- function(path, ...) {
#' @param path Path of file to read.
#' @param ... additional external data source specific named properties.
#' You can find the ORC-specific options for reading ORC files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{
#' Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @return SparkDataFrame
#' @rdname read.orc
#' @name read.orc
@@ -439,9 +439,9 @@ read.orc <- function(path, ...) {
#' @param path path of file to read. A vector of multiple paths is allowed.
#' @param ... additional data source specific named properties.
#' You can find the Parquet-specific options for reading Parquet files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option
#' }{Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @return SparkDataFrame
#' @rdname read.parquet
#' @name read.parquet
@@ -468,9 +468,9 @@ read.parquet <- function(path, ...) {
#' @param path Path of file to read. A vector of multiple paths is allowed.
#' @param ... additional external data source specific named properties.
#' You can find the text-specific options for reading text files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{
#' Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#' @return SparkDataFrame
#' @rdname read.text
#' @examples
@@ -619,8 +619,9 @@ loadDF <- function(path = NULL, source = NULL, schema = NULL, ...) {
#'
#' Additional JDBC database connection properties can be set (...)
#' You can find the JDBC-specific option and parameter documentation for reading tables via JDBC in
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option}{
#' Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option}{Data Source Option} in the version you use.
# nolint end
#'
#' Only one of partitionColumn or predicates should be set. Partitions of the table will be
#' retrieved in parallel based on the \code{numPartitions} or by the predicates.
17 changes: 11 additions & 6 deletions R/pkg/R/functions.R
@@ -264,18 +264,20 @@ NULL
#' additional named properties to control how it is converted and accepts the
#' same options as the JSON data source.
#' You can find the JSON-specific options for reading/writing JSON files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{
#' Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option}{Data Source Option}
# nolint end
#' in the version you use.
#' \item \code{to_json}: it supports the "pretty" option which enables pretty
#' JSON generation.
#' \item \code{to_csv}, \code{from_csv} and \code{schema_of_csv}: this contains
#' additional named properties to control how it is converted and accepts the
#' same options as the CSV data source.
#' You can find the CSV-specific options for reading/writing CSV files in
#' \url{
#' https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option}{
#' Data Source Option} in the version you use.
# nolint start
#' \url{https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option}{Data Source Option}
# nolint end
#' in the version you use.
#' \item \code{arrays_zip}, this contains additional Columns of arrays to be merged.
#' \item \code{map_concat}, this contains additional Columns of maps to be unioned.
#' }
@@ -3816,6 +3818,7 @@ setMethod("row_number",
#' Column, for example \code{unresolved_named_lambda_var("a", "b", "c")}
#' yields unresolved \code{a.b.c}
#' @return Column object wrapping JVM UnresolvedNamedLambdaVariable
#' @keywords internal
unresolved_named_lambda_var <- function(...) {
jc <- newJObject(
"org.apache.spark.sql.Column",
@@ -3839,6 +3842,7 @@ unresolved_named_lambda_var <- function(...) {
#' @param fun R \code{function} (unary, binary or ternary)
#' that transforms \code{Columns} into a \code{Column}
#' @return JVM \code{LambdaFunction} object
#' @keywords internal
create_lambda <- function(fun) {
as_jexpr <- function(x) callJMethod(x@jc, "expr")

@@ -3887,6 +3891,7 @@ create_lambda <- function(fun) {
#' @param cols list of character or Column objects
#' @param funs list of named list(fun = ..., expected_narg = ...)
#' @return a \code{Column} representing name applied to cols with funs
#' @keywords internal
invoke_higher_order_function <- function(name, cols, funs) {
as_jexpr <- function(x) {
if (class(x) == "character") {
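
The `@keywords internal` tags added here (and in the files below) mark non-exported helpers as internal documentation, so pkgdown leaves them out of the generated reference index rather than expecting them to be listed in `_pkgdown.yml`. A hypothetical helper documented in the same style, purely for illustration:

```r
# Hypothetical helper (not part of this PR), shown only to illustrate the
# roxygen pattern added above: tagging internal utilities so pkgdown omits
# them from the reference index.

#' Clamp a numeric vector to a range
#'
#' @param x numeric vector
#' @param lo,hi range bounds
#' @return \code{x} with values limited to the interval [lo, hi]
#' @keywords internal
clamp <- function(x, lo, hi) {
  pmin(pmax(x, lo), hi)
}
```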
1 change: 1 addition & 0 deletions R/pkg/R/jobj.R
@@ -72,6 +72,7 @@ jobj <- function(objId) {
#' @param x The JVM object reference
#' @param ... further arguments passed to or from other methods
#' @note print.jobj since 1.4.0
#' @keywords internal
print.jobj <- function(x, ...) {
name <- getClassName.jobj(x)
cat("Java ref type", name, "id", x$id, "\n", sep = " ")
2 changes: 2 additions & 0 deletions R/pkg/R/schema.R
@@ -95,6 +95,7 @@ structType.character <- function(x, ...) {
#' @param x A StructType object
#' @param ... further arguments passed to or from other methods
#' @note print.structType since 1.4.0
#' @keywords internal
print.structType <- function(x, ...) {
cat("StructType\n",
sapply(x$fields(),
@@ -234,6 +235,7 @@ structField.character <- function(x, type, nullable = TRUE, ...) {
#' @param x A StructField object
#' @param ... further arguments passed to or from other methods
#' @note print.structField since 1.4.0
#' @keywords internal
print.structField <- function(x, ...) {
cat("StructField(name = \"", x$name(),
"\", type = \"", x$dataType.toString(),
1 change: 1 addition & 0 deletions R/pkg/R/utils.R
@@ -115,6 +115,7 @@ isRDD <- function(name, env) {
#' hashCode("1") # 49
#'}
#' @note hashCode since 1.4.0
#' @keywords internal
hashCode <- function(key) {
if (class(key) == "integer") {
as.integer(key[[1]])
File renamed without changes.