From 691e8e43d4334f9f4a1fa5102f0b70e5f3bd9a0f Mon Sep 17 00:00:00 2001 From: gfinak Date: Thu, 5 Jul 2018 13:38:07 -0700 Subject: [PATCH] This commit and previous should address remaining ropensci reviewer issues. #21 #22 #23 #24 #25 #26 #27 #28 #29 #30 #31 #32 --- DESCRIPTION | 5 +- R/build.R | 6 +- R/processData.R | 27 +- man/DataPackageR.Rd | 4 +- man/package_build.Rd | 5 +- tests/testthat/test-skeleton.R | 10 +- vignettes/usingDataPackageR.R | 120 ++++--- vignettes/usingDataPackageR.Rmd | 5 + vignettes/usingDataPackageR.html | 530 +++++++++++++++++++------------ vignettes/usingDataPackageR.md | 474 ++++++++++++++++----------- 10 files changed, 718 insertions(+), 468 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8bd722d..9bd8ac5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,7 @@ Title: Construct Reproducible Analytic Data Sets as R Packages Authors@R: c(person(given = "Greg Finak", role=c("aut","cre","cph"), email="gfinak@fredhutch.org"), person(given = "Paul Obrecht", role=c("ctb"))) -Version: 0.13.6 +Version: 0.14.0 Description: Construct reproducible analytic data sets as R packages. License: MIT + file LICENSE Depends: R (>= 3.5.0) @@ -28,6 +28,7 @@ RoxygenNote: 6.0.1 Suggests: testthat, covr, - data.tree + data.tree, + mtcars20 URL: https://github.com/RGLab/DataPackageR BugReports: https://github.com/RGLab/DataPackageR/issues diff --git a/R/build.R b/R/build.R index 9ec5355..a41a903 100644 --- a/R/build.R +++ b/R/build.R @@ -7,6 +7,7 @@ #' @param packageName \code{character} path to package source directory. Defaults to the current path when NULL. #' @param vignettes \code{logical} specify whether to build vignettes. Default FALSE. #' @param log log level \code{INFO,WARN,DEBUG,FATAL} +#' @param deps \code{logical} should we pass data objects into subsequent scripts? Default TRUE #' @importFrom roxygen2 roxygenise roxygenize #' @importFrom devtools build_vignettes build parse_deps #' @importFrom usethis use_build_ignore use_rstudio proj_set use_directory @@ -32,7 +33,8 @@ #' package_build(file.path(tempdir(),pname)) package_build <- function(packageName = NULL, vignettes = FALSE, - log=INFO) { + log = INFO, + deps = TRUE) { flog.threshold(log) flog.appender(appender.console()) requireNamespace("rprojroot") @@ -71,7 +73,7 @@ package_build <- function(packageName = NULL, # Return success if we've processed everything success <- - DataPackageR(arg = package_path) + DataPackageR(arg = package_path, deps = deps) ifelse(success, flog.info("DataPackageR succeeded"), flog.warn("DataPackageR failed") diff --git a/R/processData.R b/R/processData.R index cb1ec23..be62d84 100644 --- a/R/processData.R +++ b/R/processData.R @@ -108,12 +108,13 @@ NULL #' Meant to be called before R CMD build. #' @name DataPackageR #' @param arg \code{character} name of the package to build. +#' @param deps \code{logical} should scripts pass data objects to each other (default=TRUE) #' @return logical TRUE if succesful, FALSE, if not. #' @importFrom desc desc #' @importFrom rmarkdown render #' @importFrom utils getSrcref modifyList #' @importFrom usethis proj_set proj_get -DataPackageR <- function(arg = NULL) { +DataPackageR <- function(arg = NULL, deps = TRUE) { requireNamespace("futile.logger") requireNamespace("yaml") pkg_dir <- arg @@ -132,7 +133,7 @@ DataPackageR <- function(arg = NULL) { stop("exiting", call. 
= FALSE) } } else { - logpath <- + logpath <- normalizePath( file.path(pkg_dir, "inst/extdata"), winslash = "/" @@ -232,20 +233,9 @@ DataPackageR <- function(arg = NULL) { pkg_description <- try(read.description(file = description_file), silent = TRUE ) - if (inherits(pkg_description, "try-error")) { - flog.fatal("No valid DESCRIPTION file") - { - stop( - paste0( - "You need a valid package DESCRIPTION file.", - "Please see Writing R Extensions", - "(http://cran.r-project.org/doc/manuals/", - "r-release/R-exts.html#The-DESCRIPTION-file).\n" - ), - pkg_description - ) - } - } + # The test for a valid DESCRIPTION here is no longer needed since + # we use proj_set(). + # check that we have at least one file # This is caught elsewhere @@ -264,8 +254,9 @@ DataPackageR <- function(arg = NULL) { for (i in seq_along(r_files)) { dataenv <- new.env(hash = TRUE, parent = .GlobalEnv) # assign ENVS into dataenv. - # provide functions in the package to read from it. - assign(x = "ENVS", value = ENVS, dataenv) + # provide functions in the package to read from it (if deps = TRUE) + if(deps) + assign(x = "ENVS", value = ENVS, dataenv) flog.info(paste0( "Processing ", i, " of ", length(r_files), ": ", r_files[i], diff --git a/man/DataPackageR.Rd b/man/DataPackageR.Rd index a865f5a..746246c 100644 --- a/man/DataPackageR.Rd +++ b/man/DataPackageR.Rd @@ -4,10 +4,12 @@ \alias{DataPackageR} \title{Process data generation code in 'data-raw'} \usage{ -DataPackageR(arg = NULL) +DataPackageR(arg = NULL, deps = TRUE) } \arguments{ \item{arg}{\code{character} name of the package to build.} + +\item{deps}{\code{logical} should scripts pass data objects to each other (default=TRUE)} } \value{ logical TRUE if succesful, FALSE, if not. diff --git a/man/package_build.Rd b/man/package_build.Rd index 8d89509..52274cd 100644 --- a/man/package_build.Rd +++ b/man/package_build.Rd @@ -4,7 +4,8 @@ \alias{package_build} \title{Preprocess, document and build a data package} \usage{ -package_build(packageName = NULL, vignettes = FALSE, log = INFO) +package_build(packageName = NULL, vignettes = FALSE, log = INFO, + deps = TRUE) } \arguments{ \item{packageName}{\code{character} path to package source directory. Defaults to the current path when NULL.} @@ -12,6 +13,8 @@ package_build(packageName = NULL, vignettes = FALSE, log = INFO) \item{vignettes}{\code{logical} specify whether to build vignettes. Default FALSE.} \item{log}{log level \code{INFO,WARN,DEBUG,FATAL}} + +\item{deps}{\code{logical} should we pass data objects into subsequent scripts? Default TRUE} } \description{ Combines the preprocessing, documentation, and build steps into one. 
diff --git a/tests/testthat/test-skeleton.R b/tests/testthat/test-skeleton.R index 8615748..0bdf4a6 100644 --- a/tests/testthat/test-skeleton.R +++ b/tests/testthat/test-skeleton.R @@ -37,7 +37,8 @@ test_that("package can be built from different locations", { file.path(tmp, "subsetCars"))), "subsetCars_1.0.tar.gz") - old <- setwd(file.path(tmp, "subsetCars")) + old <- + setwd(file.path(tmp, "subsetCars")) on.exit(setwd(old)) expect_equal(basename(package_build(".")), "subsetCars_1.0.tar.gz") expect_error(package_build("subsetCars")) @@ -474,6 +475,9 @@ test_that("package built in different edge cases", { force = TRUE, recursive = TRUE) package.skeleton("foo", path = tmp) + suppressWarnings(expect_error( + DataPackageR:::DataPackageR( + file.path(tmp, "foo")))) dir.create(file.path(tmp, "foo", "data-raw")) suppressWarnings(expect_error( DataPackageR:::DataPackageR( @@ -492,8 +496,8 @@ test_that("package built in different edge cases", { recursive = TRUE) - package.skeleton("foo", path = tmp) - expect_error(yml_find(file.path(tmp, "foo"))) + package.skeleton("foo", path = tempdir(),force=TRUE) + expect_error(yml_find(file.path(tempdir(), "foo"))) dir.create(file.path(tmp, "foo", "data-raw")) unlink(file.path(tmp, "foo", "DESCRIPTION")) yml <- DataPackageR:::construct_yml_config("foo.Rmd") diff --git a/vignettes/usingDataPackageR.R b/vignettes/usingDataPackageR.R index ba57f46..8efd4f1 100644 --- a/vignettes/usingDataPackageR.R +++ b/vignettes/usingDataPackageR.R @@ -5,58 +5,104 @@ knitr::opts_chunk$set( eval = TRUE ) -## ------------------------------------------------------------------------ -library(data.tree) +## ----minimal_example, results='hide'------------------------------------- library(DataPackageR) -tmp = normalizePath(tempdir()) -processing_code = system.file("extdata","tests","subsetCars.Rmd",package="DataPackageR") -print(processing_code) -setwd(tmp) -DataPackageR::datapackage.skeleton("Test", - force=TRUE, - code_files = processing_code, - r_object_names = "cars_over_20") # cars_over_20 is an R object - # created in the Rmd file. + +# Let's reproducibly package up +# the cars in the mtcars dataset +# with speed > 20. +# Our dataset will be called cars_over_20. + +# Get the code file that turns the raw data +# to our packaged and processed analysis-ready dataset. +processing_code <- + system.file("extdata", + "tests", + "subsetCars.Rmd", + package = "DataPackageR") + +# Create the package framework. +DataPackageR::datapackage_skeleton( + "mtcars20", + force = TRUE, + code_files = processing_code, + r_object_names = "cars_over_20", + path = tempdir() + ) ## ----dirstructure,echo=FALSE--------------------------------------------- -df = data.frame(pathString=file.path("Test",(list.files(tmp,recursive=TRUE)))) +library(data.tree) +df = data.frame(pathString = file.path( + "mtcars20", + list.files( + file.path(tempdir(), "mtcars20"), + include.dirs = TRUE, + recursive = TRUE + ) + )) as.Node(df) ## ---- echo=FALSE--------------------------------------------------------- -library(yaml) -setwd(tmp) -cat(as.yaml(yaml.load_file("Test/datapackager.yml"))) +cat(yaml::as.yaml(yaml::yaml.load_file(file.path(tempdir(),"mtcars20","datapackager.yml")))) -## ------------------------------------------------------------------------ -# Within the package directory -setwd(tmp) -DataPackageR:::package_build("Test") +## ----eval=TRUE----------------------------------------------------------- +# Run the preprocessing code to build cars_over_20 +# and reproducibly enclose it in a package. 
+DataPackageR:::package_build(file.path(tempdir(),"mtcars20")) ## ---- echo=FALSE--------------------------------------------------------- -library(yaml) -setwd(tmp) -df = data.frame(pathString=file.path("Test",(list.files("Test",recursive=TRUE)))) -as.Node(df) +df = data.frame(pathString = file.path( + "mtcars20", + list.files( + file.path(tempdir(), "mtcars20"), + include.dirs = TRUE, + recursive = TRUE + ) + )) + as.Node(df) -## ---- echo=FALSE--------------------------------------------------------- -setwd(tmp) -cat(readLines("Test/DATADIGEST"),sep="\n") +## ------------------------------------------------------------------------ +# Let's use the package we just created. +install.packages(file.path(tempdir(),"mtcars20_1.0.tar.gz"), type = "source", repos = NULL) +library(mtcars20) +data("cars_over_20") # load the data +cars_over_20 # Now we can use it. +?cars_over_20 # See the documentation you wrote in data-raw/documentation.R. -## ----echo=FALSE---------------------------------------------------------- -setwd(tmp) -cat(readLines("Test/DESCRIPTION"),sep="\n") +vignettes = vignette(package="mtcars20") +vignettes$results -## ----construct_config---------------------------------------------------- -#assume I have file1.Rmd and file2.R located in /data-raw, and these create 'object1' and 'object2' respectively. +## ------------------------------------------------------------------------ +# We can easily check the version of the data +DataPackageR::data_version("mtcars20") -config = construct_yml_config(code = c("file1.Rmd","file2.R"), data = c("object1","object2")) -cat(as.yaml(config)) +# You can use an assert to check the data version in reports and +# analyses that use the packaged data. +assert_data_version(data_package_name = "mtcars20", + version_string = "0.1.0", + acceptable = "equal") #If this fails, execution stops + #and provides an informative error. -## ------------------------------------------------------------------------ -path_to_package = tempdir() #pretend this is the root of our package -yml_write(config,path = path_to_package) +## ----construct_config, echo=1:2------------------------------------------ +# assume I have file1.Rmd and file2.R located in /data-raw, +# and these create 'object1' and 'object2' respectively. + +config = construct_yml_config(code = c("file1.Rmd", "file2.R"), + data = c("object1", "object2")) +cat(yaml::as.yaml(config)) ## ------------------------------------------------------------------------ +path_to_package = tempdir() #e.g., if tempdir() was the root of our package. +yml_write(config, path = path_to_package) + +## ----echo=1:2------------------------------------------------------------ config = yml_disable_compile(config,filenames = "file2.R") -cat(as.yaml(config)) +yml_write(config, path = path_to_package) # write modified yml to the package. +cat(yaml::as.yaml(config)) + +## ---- echo=FALSE--------------------------------------------------------- +cat(readLines(file.path(tempdir(),"mtcars20","DATADIGEST")),sep="\n") + +## ----echo=FALSE---------------------------------------------------------- +cat(readLines(file.path(tempdir(),"mtcars20","DESCRIPTION")),sep="\n") diff --git a/vignettes/usingDataPackageR.Rmd b/vignettes/usingDataPackageR.Rmd index cf313d5..c810e5c 100644 --- a/vignettes/usingDataPackageR.Rmd +++ b/vignettes/usingDataPackageR.Rmd @@ -290,6 +290,11 @@ A script (e.g., `script2.Rmd`) running after `script1.Rmd` can access a stored d `DataPackageR::datapackager_object_read("script1_dataset")`. 
+Passing of data objects amongst scripts can be turned off via:
+
+`package_build(deps = FALSE)`
+
+
 # Additional Details
 
 We provide some additional details for the interested.
diff --git a/vignettes/usingDataPackageR.html b/vignettes/usingDataPackageR.html
index da7e101..7e9cdcb 100644
--- a/vignettes/usingDataPackageR.html
+++ b/vignettes/usingDataPackageR.html
@@ -12,7 +12,7 @@
-
+
Using DataPackageR
@@ -280,97 +280,108 @@

Using DataPackageR

Greg Finak gfinak@fredhutch.org

-

2018-07-02

+

2018-07-05

- -
-

DataPackageR

-

A package to reproducibly process raw data into packaged, analysis-ready data sets.

-

Build Status Coverage status AppVeyor build status DOI

-
-

Code of conduct

-

Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.

-
-
-

Preprint and publication.

-

The publication describing the package is now available at Gates Open Research.

-

The preprint is on biorxiv.

-
-
-

Goals

-

You have raw data that needs to be tidied and otherwise processed into a standardized analytic data set (a data set that is ready for analysis). You want to do the processing using best practices for reproducible research.

-
-

The current state of affairs

-

Normally, you’ll write some code that does the tidying and outputs a tidy data set. If you want to distribute your data set, you can put it in an R package. The preferred mechanism is to place your data tidying code in data-raw in the package source tree and use the devtools package (specifically devtools::use_data) to save the data into the data directory. The build process will include your data set in the final package. You’ll also have to remember to document the data set in roxygen, and write a vignette showing how to use the data. For version control and easy distribution you might post the package on github.

-
-
-

Scaling up

-

The process outlined works well for smaller data sets. It can be a hassle if you have complex data that change frequently (as is often the case in biology, where data trickle in from collaborators and follow-up experiments), or more generally if you have large data sets where raw data can’t be distributed as part of the package source due to size restrictions (e.g. FASTQ files for sequencing, FCS files for flow cytometry, or other “omics” data).

-
-
-

DataPackageR

-

The DataPackageR package simplifies bundling of code, data and documentation into a single R package that can be versioned and distributed. The datapackage.skeleton() API lets you point DataPackageR at your data processing code (in the form of Rmd and / or R files). These are expected to produce data objects to be stored in the final package. The names of these are also passed to datapackage.skeleton(). This produces the necessary package structure, and populations a datapackager.yml configuration file used by the build process.

-

The package_build() API runs the processing code specified in the .yml files and produces html reports of the processing as package vignettes. It also builds boilerplate roxygen documentation of the R objects specified in the .yml, computes checksums of stored R objects and version tags the entire data set collection.

-

If raw data changes, the user can rebuild the data sets in the R package with subsequent calls to package_build() which will re-run the processing, compare the cheksums of new R objects against those currently stored in the package. Any changes force an increment of the Dataversion string in the package DESCRIPTION file. When the package is installed, data sets can be accessed via the standard data() API, package vignettes describing the data processing can be accessed via vignette(), documentation via ?, and the data version via dataVersion(packageName).

-
-
+ -
-

Installation

-

The usual package installation mechanism works:

-
library(devtools)
-devtools::install_github("RGLab/DataPackageR", auth_token=NULL)
+ +
+

Purpose

+

This vignette demonstrates how to use DataPackageR to build a datapackage from the mtcars data set.

-
-

Usage

-

Set up a new data package.

-

We’ll set up a new data package that processes the cars data by subsetting it to include only measurements of stopping distances of cars at speeds greater than 20 mph. It is processed using an Rmd file located in inst/extdata/tests/subsetCars.Rmd that produces a new object called cars_over_20. The package will be called Test. The work will be done in the system /tmp directory.

- -
-

Package skeleton structure

-

This has created a directory, “Test” with the skeleton of a data package.

-

The DESCRIPTION file should be filled out to describe your package. It contains a new DataVersion string, and the revision is automatically incremented if the packaged data changes.

-

Read-and-delete-me has some helpful instructions on how to proceed.

-

The data-raw directory is where the data cleaning code (Rmd) files reside. The contents of this directory are:

-
                   levelName
-1 Test                      
-2  °--Test                  
-3      ¦--DESCRIPTION       
-4      ¦--Read-and-delete-me
-5      ¦--data-raw          
-6      ¦   °--subsetCars.Rmd
-7      °--datapackager.yml  
-

datapackager.yml can be edited as necessary to include additional processing scripts (which should be placed in data-raw), and raw data should be located under under /inst/extdata. It should be copied into that path and the data munging scripts edited to read from there.

+
+

Set up a new data package.

+

We’ll set up a new data package based on the mtcars example in the README. The datapackage_skeleton() API is used to set up a new package; a sketch of the call follows the list below. The user needs to provide:

+
    +
  • R or Rmd code files that do data processing.
  • +
  • A list of R object names created by those code files.
  • +
+ +
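A sketch of the call that wires these together, mirroring this vignette's R source (the subsetCars.Rmd that ships with DataPackageR stands in for your own processing script):

```r
library(DataPackageR)

# Processing script that turns raw data into the packaged,
# analysis-ready object. Here we reuse the example Rmd shipped
# with DataPackageR; point this at your own R / Rmd files.
processing_code <- system.file(
  "extdata", "tests", "subsetCars.Rmd",
  package = "DataPackageR"
)

# Create the data package framework: one package ("mtcars20")
# that will store one R object ("cars_over_20").
datapackage_skeleton(
  "mtcars20",
  force          = TRUE,
  code_files     = processing_code,
  r_object_names = "cars_over_20",
  path           = tempdir()
)
```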
+

What’s in the package skeleton structure?

+

This has created a datapackage source tree named “mtcars20” (in a temporary directory). For a real use case you would pick a path on your filesystem where you could then initialize a new github repository for the package.

+

The contents of mtcars20 are:

+
                levelName
+1  mtcars20              
+2   ¦--DESCRIPTION       
+3   ¦--R                 
+4   ¦--Read-and-delete-me
+5   ¦--data              
+6   ¦--data-raw          
+7   ¦   °--subsetCars.Rmd
+8   ¦--datapackager.yml  
+9   ¦--inst              
+10  ¦   °--extdata       
+11  °--man               
+

You should fill out the DESCRIPTION file to describe your data package. It contains a new DataVersion string that will be automatically incremented when the data package is built if the packaged data has changed.

+

The user-provided code files reside in data-raw. They are executed during the data package build process.

-
-

Yaml configuration

-

Here are the contents of datapackager.yml:

+
+

A few words about the YAML config file

+

A datapackager.yml file is used to configure and control the build process.

+

The contents are:

configuration:
   files:
     subsetCars.Rmd:
@@ -378,71 +389,90 @@ 

Yaml configuration

enabled: yes objects: cars_over_20 render_root: - tmp: '787709'
-

It includes a files property that has an entry for each script, with the name: and enabled: keys for each file. The objects property lists the data objects produced by the scripts.

-

The render_root property specifies the directory where the Rmd files are rendered. If temporary objects are produced during the processing, they will appear in this directory without polluting the package source tree. A temporary directory is used by default.

+ tmp: '95288' +

The two main pieces of information in the configuration are a list of the files to be processed and the data sets the package will store.

+

This example packages an R data set named cars_over_20 (the name was passed in to datapackage_skeleton()). It is created by the subsetCars.Rmd file.

+

The objects must be listed in the yaml configuration file. datapackage_skeleton() ensures this is done for you automatically.

+

DataPackageR provides an API for modifying this file, so it does not need to be done by hand.

+

Further information on the contents of the YAML configuration file and its API is in the YAML Configuration Details

-
-

Build your package.

-

Once your scripts are in place and the data objects are documented, you build the package.

-

To run the build process:

-
# Within the package directory
-setwd(tmp)
-DataPackageR:::package_build("Test") 
-INFO [2018-07-02 12:26:31] Logging to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/inst/extdata/Logfiles/processing.log
-INFO [2018-07-02 12:26:31] Processing data
-INFO [2018-07-02 12:26:31] Reading yaml configuration
-INFO [2018-07-02 12:26:31] Found /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/data-raw/subsetCars.Rmd
-INFO [2018-07-02 12:26:31] Processing 1 of 1: /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/data-raw/subsetCars.Rmd
-processing file: subsetCars.Rmd
-output file: subsetCars.knit.md
-/usr/local/bin/pandoc +RTS -K512m -RTS subsetCars.utf8.md --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash+smart --output /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/inst/extdata/Logfiles/subsetCars.html --email-obfuscation none --self-contained --standalone --section-divs --template /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp7wPrTh/rmarkdown-strcd9679f83bd2.html --mathjax --variable 'mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' 
-
-Output created: Test/inst/extdata/Logfiles/subsetCars.html
-INFO [2018-07-02 12:26:31] 1 required data objects created by subsetCars.Rmd
-INFO [2018-07-02 12:26:31] Saving to data
-INFO [2018-07-02 12:26:32] Copied documentation to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/R/Test.R
-✔ Creating 'vignettes/'
-✔ Creating 'inst/doc/'
-INFO [2018-07-02 12:26:32] Done
-INFO [2018-07-02 12:26:32] DataPackageR succeeded
-INFO [2018-07-02 12:26:32] Building documentation
-First time using roxygen2. Upgrading automatically...
-Updating roxygen version in /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/DESCRIPTION
-Writing NAMESPACE
-Writing Test.Rd
-Writing cars_over_20.Rd
-INFO [2018-07-02 12:26:32] Building package
-'/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
-  --no-environ --no-save --no-restore --quiet CMD build  \
-  '/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test'  \
-  --no-resave-data --no-manual --no-build-vignettes 
-
-[1] "/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test_1.0.tar.gz"
+
+

Where do I put raw data?

+

Raw data (provided the size is not prohibitive) can be placed in inst/extdata.

+

In this example we are reading from data(mtcars) rather than from the file system.

+
+

An API to locate data sets within an R or Rmd file.

+

To locate the data to read from the filesystem:

+
    +
  • DataPackageR::project_extdata_path() to get the path to inst/extdata from inside an Rmd or R file. (e.g., /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/mtcars20/inst/extdata)

  • +
  • DataPackageR::project_path() to get the path to the datapackage root. (e.g., /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/mtcars20)

  • +
+

Raw data stored externally can be retrieved relative to these paths.
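For example, a processing script under data-raw/ might read a raw file like this (a minimal sketch; raw_counts.csv is a hypothetical file you would place in inst/extdata):

```r
# Inside a data-raw/ processing script: build paths relative to the
# data package so the code is portable across machines.
raw_file <- file.path(
  DataPackageR::project_extdata_path(),
  "raw_counts.csv"            # hypothetical raw file under inst/extdata
)
raw_counts <- read.csv(raw_file)
```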

-
-

Logging the build process

-

DataPackageR uses the futile.logger pagckage to log progress. If there are errors in the processing, the script will notify you via logging to console and to /private/tmp/Test/inst/extdata/Logfiles/processing.log. Errors should be corrected and the build repeated.

-

If everything goes smoothly, you will have a new package built in the parent directory. In this case we have a new package Test_1.0.tar.gz. When the package is installed, it will contain a vignette subsetCars that can be loaded using the vignette() API. The vignette will detail the processing performed by the subsetCars.Rmd processing script.

-
-

The package source directory after building

+
+
+

Build the data package.

+

Once the skeleton framework is set up,

+
# Run the preprocessing code to build cars_over_20
+# and reproducibly enclose it in a package.
+DataPackageR:::package_build(file.path(tempdir(),"mtcars20"))
+INFO [2018-07-05 11:41:30] Logging to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/inst/extdata/Logfiles/processing.log
+INFO [2018-07-05 11:41:30] Processing data
+INFO [2018-07-05 11:41:30] Reading yaml configuration
+INFO [2018-07-05 11:41:30] Found /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/data-raw/subsetCars.Rmd
+INFO [2018-07-05 11:41:30] Processing 1 of 1: /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/data-raw/subsetCars.Rmd
+processing file: subsetCars.Rmd
+output file: subsetCars.knit.md
+/usr/local/bin/pandoc +RTS -K512m -RTS subsetCars.utf8.md --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash+smart --output /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/inst/extdata/Logfiles/subsetCars.html --email-obfuscation none --self-contained --standalone --section-divs --template /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/rmarkdown-str1c6861088f2d.html --mathjax --variable 'mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' 
+
+Output created: /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/inst/extdata/Logfiles/subsetCars.html
+INFO [2018-07-05 11:41:30] 1 required data objects created by subsetCars.Rmd
+INFO [2018-07-05 11:41:30] Saving to data
+INFO [2018-07-05 11:41:30] Copied documentation to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/R/mtcars20.R
+✔ Creating 'vignettes/'
+✔ Creating 'inst/doc/'
+INFO [2018-07-05 11:41:30] Done
+INFO [2018-07-05 11:41:30] DataPackageR succeeded
+INFO [2018-07-05 11:41:30] Building documentation
+First time using roxygen2. Upgrading automatically...
+Updating roxygen version in /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/DESCRIPTION
+Writing NAMESPACE
+Writing mtcars20.Rd
+Writing cars_over_20.Rd
+INFO [2018-07-05 11:41:30] Building package
+'/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
+  --no-environ --no-save --no-restore --quiet CMD build  \
+  '/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20'  \
+  --no-resave-data --no-manual --no-build-vignettes 
+
+[1] "/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20_1.0.tar.gz"
+
+

A log of the build process

+

DataPackageR uses the futile.logger package to log progress.

+

If there are errors in the processing, the script will notify you via logging to the console and to inst/extdata/Logfiles/processing.log under the package source directory (here, the mtcars20 directory created under tempdir()). Errors should be corrected and the build repeated.
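If more detail is needed while debugging, the log threshold can be raised when rebuilding; a sketch, assuming the futile.logger level constants are attached:

```r
library(futile.logger)  # provides the INFO / DEBUG / WARN / FATAL constants

# Rebuild with a more verbose log threshold than the default INFO.
DataPackageR:::package_build(
  file.path(tempdir(), "mtcars20"),
  log = DEBUG
)
```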

+

If everything goes smoothly, you will have a new package built in the parent directory.

+

In this case we have a new package mtcars20_1.0.tar.gz.

+
+
+

A note about the package source directory after building.

+

The package source directory changes after the first build.

                         levelName
-1  Test                           
+1  mtcars20                       
 2   ¦--DATADIGEST                 
 3   ¦--DESCRIPTION                
 4   ¦--NAMESPACE                  
 5   ¦--R                          
-6   ¦   °--Test.R                 
+6   ¦   °--mtcars20.R             
 7   ¦--Read-and-delete-me         
-8   ¦--data-raw                   
-9   ¦   ¦--documentation.R        
-10  ¦   ¦--subsetCars.R           
-11  ¦   ¦--subsetCars.Rmd         
-12  ¦   ¦--subsetCars.knit.md     
-13  ¦   °--subsetCars.utf8.md     
-14  ¦--data                       
-15  ¦   °--cars_over_20.rda       
+8   ¦--data                       
+9   ¦   °--cars_over_20.rda       
+10  ¦--data-raw                   
+11  ¦   ¦--documentation.R        
+12  ¦   ¦--subsetCars.R           
+13  ¦   ¦--subsetCars.Rmd         
+14  ¦   ¦--subsetCars.knit.md     
+15  ¦   °--subsetCars.utf8.md     
 16  ¦--datapackager.yml           
 17  ¦--inst                       
 18  ¦   ¦--doc                    
@@ -453,101 +483,181 @@ 

The package source directory after building

23 ¦ ¦--processing.log 24 ¦ °--subsetCars.html 25 ¦--man -26 ¦ ¦--Test.Rd -27 ¦ °--cars_over_20.Rd +26 ¦ ¦--cars_over_20.Rd +27 ¦ °--mtcars20.Rd 28 °--vignettes 29 °--subsetCars.Rmd
-
-

Details

-

A number of things have changed. The subsetCars processing script now appears under /vignettes and inst/doc as a processed html report so that it will be available to view via vignette() once the package is installed. inst/extdata/Logfiles contains a log file of the entire build process as well as intermediate files created while parsing the R / Rmd code. Documentation Rd files appear in /man, these should be edite to provide further details on the data objects in the package. The data objects are stored under /data where we see cars_over_20.rda, the object we initially specified in datapackager.yml.

+
+

Update the autogenerated documentation.

+

After the first build, the R directory contains mtcars20.R, which has autogenerated roxygen2 markup documentation for the data package and for the packaged data cars_over_20.

+

The processed Rd files can be found in man.
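For orientation, roxygen2 data documentation generally looks something like the sketch below (hypothetical text, not the exact autogenerated file; edit the title, description and @format to describe your data):

```r
#' Cars with speed over 20 mph
#'
#' A subset of the classic cars data containing only observations
#' with speed greater than 20 mph.
#'
#' @format A data frame with columns speed and dist.
#' @source Produced by data-raw/subsetCars.Rmd.
#' @name cars_over_20
#' @docType data
NULL
```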

+
+
+

Don’t forget to rebuild the package.

+

You should update the documentation in R/mtcars20.R, then call package_build() again.

+
-
-

Versioning data objects

-

The DataPackageR package calculates an md5 checksum of each data object it stores, and keeps track of them in a file called DATADIGEST.

+
+

Installing and using the new data package

+
+

Accessing vignettes, data sets, and data set documentation.

+

The package source also contains files in the vignettes and inst/doc directories that provide a log of the data processing.

+

When the package is installed, these will be accessible via the vignette() API.

+

The vignette will detail the processing performed by the subsetCars.Rmd processing script.

+

The data set documentation will be accessible via ?cars_over_20, and the data sets via data().
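Mirroring the usage code in this vignette's R source (the tarball path assumes the build above ran under tempdir()):

```r
# Install and load the data package we just built.
install.packages(
  file.path(tempdir(), "mtcars20_1.0.tar.gz"),
  type = "source", repos = NULL
)
library(mtcars20)

data("cars_over_20")             # load the packaged data set
?cars_over_20                    # documentation from data-raw/documentation.R
vignette(package = "mtcars20")   # processing report(s)

# Check, and assert, the version of the packaged data.
DataPackageR::data_version("mtcars20")
DataPackageR::assert_data_version(
  data_package_name = "mtcars20",
  version_string    = "0.1.0",
  acceptable        = "equal"
)
```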

+ +
+ +
+
+

Next steps

+

You should place the data package source directory under git version control. This allows you to version control your data processing code.

+
+
+

Partial builds and migrating old data packages.

+

Version 1.12.0 has moved away from controlling the build process using datasets.R and an additional masterfile argument.

+

The build process is now controlled via a datapackager.yml configuration file located in the package root directory. (see YAML Configuration Details)

+

You can migrate an old package by constructing such a config file using the construct_yml_config() API.
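A sketch reproducing the example from the vignette source; file1.Rmd, file2.R, object1 and object2 are illustrative names:

```r
# file1.Rmd and file2.R live in data-raw/ and create
# 'object1' and 'object2' respectively.
config <- DataPackageR::construct_yml_config(
  code = c("file1.Rmd", "file2.R"),
  data = c("object1", "object2")
)
cat(yaml::as.yaml(config))
```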

+ +

config is a newly constructed yaml configuration object. It can be written to the package directory:
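For example (as in the vignette source, pretending tempdir() is the package root):

```r
path_to_package <- tempdir()   # e.g., if tempdir() were the root of our package
DataPackageR::yml_write(config, path = path_to_package)
```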

+ +

Now the package at path_to_package will build with version 1.12.0 or greater.

+
+

Partial builds

+

We can also perform partial builds of a subset of files in a package by toggling the enabled key in the config file.

+

This can be done with the following API:
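A sketch, continuing the file1.Rmd / file2.R example from above:

```r
# Disable file2.R: it will be skipped at the next build, but its data
# and documentation are retained in the package.
config <- DataPackageR::yml_disable_compile(config, filenames = "file2.R")
DataPackageR::yml_write(config, path = path_to_package)  # write the change back
cat(yaml::as.yaml(config))
```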

+ +

Note that the modified configuration needs to be written back to the package source directory in order for the changes to take effect.

+

The consequence of toggling a file to enable: no is that it will be skipped when the package is rebuilt, but the data will still be retained in the package, and the documentation will not be altered.

+

This is useful in situations where we have multiple data sets and want to re-run one script to update a specific data set, but not the other scripts, which may be too time consuming to re-run.

+
+
+
+

Multi-script pipelines.

+

We may have situations where we have multi-script pipelines. There are two ways to share data among scripts.

+
    +
  1. filesystem artifacts
  2. +
  3. data objects passed to subsequent scripts.
  4. +
+
+

File system artifacts

+

The yaml configuration property render_root specifies the working directory where scripts will be rendered.

+

If a script writes files to the working directory, that is where files will appear. These can be read by subsequent scripts.
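A minimal sketch of this file-based hand-off (the object and the file name intermediate.rds are hypothetical):

```r
# In script1.Rmd: write an intermediate artifact into render_root
# (the working directory while the script is rendered).
intermediate_result <- cars[cars$speed > 20, ]
saveRDS(intermediate_result, "intermediate.rds")

# In script2.Rmd: read the artifact back from the same working directory.
intermediate_result <- readRDS("intermediate.rds")
```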

+
+
+

Passing data objects to subsequent scripts.

+

A script (e.g., script2.Rmd) running after script1.Rmd can access a stored data object named script1_dataset created by script1.Rmd by calling

+

DataPackageR::datapackager_object_read("script1_dataset").
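In code, the hand-off inside script2.Rmd looks like the sketch below; the new deps argument added to package_build() in this changeset disables it when set to FALSE:

```r
# Inside script2.Rmd: read a data object that script1.Rmd has already stored.
script1_dataset <- DataPackageR::datapackager_object_read("script1_dataset")

# This hand-off can be switched off at build time via the deps argument
# introduced in this changeset:
# DataPackageR:::package_build(file.path(tempdir(), "mtcars20"), deps = FALSE)
```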

+
+
+
+

Additional Details

+

We provide some additional details for the interested.

+
+

Fingerprints of stored data objects

+

DataPackageR calculates an md5 checksum of each data object it stores, and keeps track of them in a file called DATADIGEST.

  • Each time the package is rebuilt, the md5 sums of the new data objects are compared against the DATADIGEST.
  • If they don’t match, the build process checks that the DataVersion string has been incremented in the DESCRIPTION file.
  • If it has not the build process will exit and produce an error message.
-
-

DATADIGEST

+
+

DATADIGEST

The DATADIGEST file contains the following:

DataVersion: 0.1.0
 cars_over_20: 3ccb5b0aaa74fe7cfc0d3ca6ab0b5cf3
-
-

DESCRIPTION

+
+

DESCRIPTION

The description file has the new DataVersion string.

-
Package: Test
+
Package: mtcars20
 Type: Package
 Title: What the package does (short line)
 Version: 1.0
-Date: 2018-07-02
+Date: 2018-07-05
 Author: Who wrote it
 Maintainer: Who to complain to <yourfault@somewhere.net>
 Description: More about what it does (maybe more than one line)
 License: What license is it under?
 DataVersion: 0.1.0
+Suggests: 
+    knitr,
+    rmarkdown
+VignetteBuilder: knitr
 RoxygenNote: 6.0.1
-
-

Next steps

-

Your downstream data analysis can depend on a specific version of your data package (for example by tesing the packageVersion() string);

-
if(DataPackageR::packageVersion("MyNewStudy") != "1.0.0")
-  stop("The expected version of MyNewStudy is 1.0.0, but ",packageVersion("MyNewStudy")," is installed! Analysis results may differ!")
-

The DataPackageR packge also provides datasetVersion() to extract the data set version information.

-

You should also place the data package source directory under git version control. This allows you to version control your data processing code.

-
-

Why not use R CMD build?

+
+

Why not use R CMD build?

If the processing script is time consuming or the data set is particularly large, then R CMD build would run the code each time the package is installed. In such cases, raw data may not be available, or the environment to do the data processing may not be set up for each user of the data. In such cases, DataPackageR provides a mechanism to decouple data processing from package building/installation for downstream users of the data.

-
-

Partial builds and migrating old data packages.

-

Version 1.12.0 has moved away from controlling the build process using datasets.R and an additional masterfile argument. The build process is now controlled via a datapackager.yml configuration file located in the package root directory.

-

You can migrate an old package by constructing such a config file using the construct_yml_config() API.

- -

config is a newly constructed yaml configuration object. It can be written to the package directory:

- -

Now the package at path_to_package will build with version 1.12.0 or greater.

-

We can also perform partial builds of a subset of files in a package by toggling the enabled key in the config file. This can be done with the following API:

- -

Where config is a configuration read from a data package root directory. The config object needs to be written back to the package root in order for the changes to take effect. The consequence of toggling a file to enable: no is that it will be skipped when the package is built, but the data will be retained, and the documentation will not be altered.

-
-
diff --git a/vignettes/usingDataPackageR.md b/vignettes/usingDataPackageR.md index 8494aab..96f09d4 100644 --- a/vignettes/usingDataPackageR.md +++ b/vignettes/usingDataPackageR.md @@ -1,12 +1,13 @@ --- title: "Using DataPackageR" author: "Greg Finak " -date: "2018-07-02" +date: "2018-07-05" output: rmarkdown::html_vignette: keep_md: TRUE + toc: yes vignette: > - %\VignetteIndexEntry{A quick guide to using DataPackageR} + %\VignetteIndexEntry{A Guide to using DataPackageR} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} \usepackage{graphicx} @@ -14,87 +15,46 @@ vignette: > -# DataPackageR +## Purpose -A package to reproducibly process raw data into packaged, analysis-ready data sets. +This vignette demonstrates how to use DataPackageR to build a datapackage from the `mtcars` data set. - [![Build Status](https://travis-ci.org/RGLab/DataPackageR.svg?branch=master)](https://travis-ci.org/RGLab/DataPackageR) - [![Coverage status](https://codecov.io/gh/RGLab/DataPackageR/branch/master/graph/badge.svg)](https://codecov.io/github/RGLab/DataPackageR?branch=master) - [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/RGLab/DataPackageR?branch=master&svg=true)](https://ci.appveyor.com/project/RGLab/DataPackageR) -[![DOI](https://zenodo.org/badge/29267435.svg)](https://doi.org/10.5281/zenodo.1292095) +## Set up a new data package. -## Code of conduct +We'll set up a new data package based on `mtcars` example in the [README](https://github.com/RGLab/DataPackageR/blob/master/README.md). +The `datapackage_skeleton()` API is used to set up a new package. +The user needs to provide: -Please note that this project is released with a [Contributor Code of Conduct](CODE_OF_CONDUCT.md). - By participating in this project you agree to abide by its terms. +- R or Rmd code files that do data processing. +- A list of R object names created by those code files. -## Preprint and publication. -The publication describing the package is now available at [Gates Open Research](https://gatesopenresearch.org/articles/2-31/v1). - -The preprint is on [biorxiv](https://doi.org/10.1101/342907). - -## Goals - -You have raw data that needs to be tidied and otherwise processed into a standardized analytic data set (a data set that is ready for analysis). -You want to do the processing using best practices for reproducible research. - -### The current state of affairs - -Normally, you'll write some code that does the tidying and outputs a tidy data set. -If you want to distribute your data set, you can put it in an R package. -The preferred mechanism is to place your data tidying code in `data-raw` in the package source tree and use the `devtools` package (specifically `devtools::use_data`) to save the data into the `data` directory. The build process will include your data set in the final package. -You'll also have to remember to document the data set in `roxygen`, and write a vignette showing how to use the data. -For version control and easy distribution you might post the package on github. - -### Scaling up - -The process outlined works well for smaller data sets. -It can be a hassle if you have complex data that change frequently (as is often the case in biology, where data trickle in from collaborators and follow-up experiments), or more generally if you have large data sets where raw data can't be distributed as part of the package source due to size restrictions (e.g. FASTQ files for sequencing, FCS files for flow cytometry, or other "omics" data). 
- -### DataPackageR - -The `DataPackageR` package simplifies bundling of code, data and documentation into a single R package that can be versioned and distributed. -The `datapackage.skeleton()` API lets you point `DataPackageR` at your data processing code (in the form of Rmd and / or R files). These are expected to produce `data objects` to be stored in the final package. The names of these are also passed to `datapackage.skeleton()`. This produces the necessary package structure, and populations a `datapackager.yml` configuration file used by the build process. - -The `package_build()` API runs the processing code specified in the `.yml` files and produces html reports of the processing as **package vignettes**. It also builds boilerplate `roxygen` documentation of the R objects specified in the `.yml`, computes checksums of stored R objects and version tags the entire data set collection. - -If raw data changes, the user can rebuild the data sets in the R package with subsequent calls to `package_build()` which will re-run the processing, compare the cheksums of new R objects against those currently stored in the package. -Any changes force an increment of the `Dataversion` string in the package DESCRIPTION file. -When the package is installed, data sets can be accessed via the standard `data()` API, package vignettes describing the data processing can be accessed via `vignette()`, documentation via `?`, and the data version via `dataVersion(packageName)`. - - -# Installation - -The usual package installation mechanism works: - -``` -library(devtools) -devtools::install_github("RGLab/DataPackageR", auth_token=NULL) -``` - -# Usage - -Set up a new data package. - -We'll set up a new data package that processes the `cars` data by subsetting it to include only measurements of stopping distances of cars at speeds greater than 20 mph. It is processed using an Rmd file located in `inst/extdata/tests/subsetCars.Rmd` that produces a new object called `cars_over_20`. The package will be called `Test`. The work will be done in the system `/tmp` directory. ```r -library(data.tree) library(DataPackageR) -tmp = normalizePath(tempdir()) -processing_code = system.file("extdata","tests","subsetCars.Rmd",package="DataPackageR") -print(processing_code) -[1] "/Users/gfinak/Documents/Projects/DataPackageR/inst/extdata/tests/subsetCars.Rmd" -setwd(tmp) -DataPackageR::datapackage.skeleton("Test", - force=TRUE, - code_files = processing_code, - r_object_names = "cars_over_20") # cars_over_20 is an R object -Warning in DataPackageR::datapackage.skeleton("Test", force = TRUE, -code_files = processing_code, : Please use datapackage_skeleton() instead -of datapackage.skeleton() + +# Let's reproducibly package up +# the cars in the mtcars dataset +# with speed > 20. +# Our dataset will be called cars_over_20. + +# Get the code file that turns the raw data +# to our packaged and processed analysis-ready dataset. +processing_code <- + system.file("extdata", + "tests", + "subsetCars.Rmd", + package = "DataPackageR") + +# Create the package framework. +DataPackageR::datapackage_skeleton( + "mtcars20", + force = TRUE, + code_files = processing_code, + r_object_names = "cars_over_20", + path = tempdir() + ) Creating directories ... Creating DESCRIPTION ... Creating NAMESPACE ... @@ -102,42 +62,45 @@ Creating Read-and-delete-me ... Saving functions and data ... Making help files ... Done. -Further steps are described in './Test/Read-and-delete-me'. 
+Further steps are described in '/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/mtcars20/Read-and-delete-me'. Adding DataVersion string to DESCRIPTION Creating data and data-raw directories configuring yaml file - # created in the Rmd file. ``` -### Package skeleton structure - -This has created a directory, "Test" with the skeleton of a data package. +### What's in the package skeleton structure? -The `DESCRIPTION` file should be filled out to describe your package. It contains a new `DataVersion` string, and the -revision is automatically incremented if the packaged data changes. +This has created a datapackage source tree named "mtcars2" (in a temporary directory). +For a real use case you would pick a `path` on your filesystem where you could then initialize a new github repository for the package. -`Read-and-delete-me` has some helpful instructions on how to proceed. - -The `data-raw` directory is where the data cleaning code (`Rmd`) files reside. -The contents of this directory are: +The contents of `mtcars20` are: ``` - levelName -1 Test -2 °--Test -3 ¦--DESCRIPTION -4 ¦--Read-and-delete-me -5 ¦--data-raw -6 ¦ °--subsetCars.Rmd -7 °--datapackager.yml + levelName +1 mtcars20 +2 ¦--DESCRIPTION +3 ¦--R +4 ¦--Read-and-delete-me +5 ¦--data +6 ¦--data-raw +7 ¦ °--subsetCars.Rmd +8 ¦--datapackager.yml +9 ¦--inst +10 ¦ °--extdata +11 °--man ``` -`datapackager.yml` can be edited as necessary to include additional processing scripts (which should be placed in `data-raw`), and raw data should be located under under `/inst/extdata`. It should be copied into that path and the data munging scripts edited to read from there. +You should fill out the `DESCRIPTION` file to describe your data package. +It contains a new `DataVersion` string that will be automatically incremented when the data package is built *if the packaged data has changed*. + +The user-provided code files reside in `data-raw`. They are executed during the data package build process. -### Yaml configuration +### A few words abou the YAML config file -Here are the contents of `datapackager.yml`: +A `datapackager.yml` file is used to configure and control the build process. + +The contents are: ``` @@ -148,83 +111,113 @@ configuration: enabled: yes objects: cars_over_20 render_root: - tmp: '787709' + tmp: '95288' ``` -It includes a `files` property that has an entry for each script, with the `name:` and `enabled:` keys for each file. The `objects` property lists the data objects produced by the scripts. +The two main pieces of information in the configuration are a list of the files to be processed and the data sets the package will store. + +This example packages an R data set named `cars_over_20` (the name was passed in to `datapackage_skeleton()`). +It is created by the `subsetCars.Rmd` file. + + +The objects must be listed in the yaml configuration file. `datapackage_skeleton()` ensures this is done for you automatically. + +DataPackageR provides an API for modifying this file, so it does not need to be done by hand. + +Further information on the contents of the YAML configuration file, and the API are in the [YAML Configuration Details](https://github.com/RGLab/DataPackageR/blob/master/YAML_CONFIG.md) + +### Where do I put raw data? -The `render_root` property specifies the directory where the Rmd files are rendered. If temporary objects are produced during the processing, they will appear in this directory without polluting the package source tree. A temporary directory is used by default. 
+Raw data (provided the size is not prohibitive) can be placed in `inst/extdata`. -### Build your package. +In this example we are reading from `data(mtcars)` rather than from the file system. -Once your scripts are in place and the data objects are documented, you build the package. - -To run the build process: +#### An API to locate data sets within an R or Rmd file. + +To locate the data to read from the filesystem: + +- `DataPackageR::project_extdata_path()` to get the path to `inst/extdata` from inside an `Rmd` or `R` file. (e.g., /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/mtcars20/inst/extdata) + +- `DataPackageR::project_path()` to get the path to the datapackage root. (e.g., /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/mtcars20) + +Raw data stored externally can be retreived relative to these paths. + + +## Build the data package. + +Once the skeleton framework is set up, ```r -# Within the package directory -setwd(tmp) -DataPackageR:::package_build("Test") -INFO [2018-07-02 12:26:31] Logging to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/inst/extdata/Logfiles/processing.log -INFO [2018-07-02 12:26:31] Processing data -INFO [2018-07-02 12:26:31] Reading yaml configuration -INFO [2018-07-02 12:26:31] Found /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/data-raw/subsetCars.Rmd -INFO [2018-07-02 12:26:31] Processing 1 of 1: /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/data-raw/subsetCars.Rmd +# Run the preprocessing code to build cars_over_20 +# and reproducibly enclose it in a package. +DataPackageR:::package_build(file.path(tempdir(),"mtcars20")) +INFO [2018-07-05 11:41:30] Logging to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/inst/extdata/Logfiles/processing.log +INFO [2018-07-05 11:41:30] Processing data +INFO [2018-07-05 11:41:30] Reading yaml configuration +INFO [2018-07-05 11:41:30] Found /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/data-raw/subsetCars.Rmd +INFO [2018-07-05 11:41:30] Processing 1 of 1: /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/data-raw/subsetCars.Rmd processing file: subsetCars.Rmd output file: subsetCars.knit.md -/usr/local/bin/pandoc +RTS -K512m -RTS subsetCars.utf8.md --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash+smart --output /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/inst/extdata/Logfiles/subsetCars.html --email-obfuscation none --self-contained --standalone --section-divs --template /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp7wPrTh/rmarkdown-strcd9679f83bd2.html --mathjax --variable 'mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' +/usr/local/bin/pandoc +RTS -K512m -RTS subsetCars.utf8.md --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash+smart --output /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/inst/extdata/Logfiles/subsetCars.html --email-obfuscation none --self-contained --standalone --section-divs --template /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 
'theme:bootstrap' --include-in-header /var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T//Rtmp3EWJ9k/rmarkdown-str1c6861088f2d.html --mathjax --variable 'mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' -Output created: Test/inst/extdata/Logfiles/subsetCars.html -INFO [2018-07-02 12:26:31] 1 required data objects created by subsetCars.Rmd -INFO [2018-07-02 12:26:31] Saving to data -INFO [2018-07-02 12:26:32] Copied documentation to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/R/Test.R +Output created: /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/inst/extdata/Logfiles/subsetCars.html +INFO [2018-07-05 11:41:30] 1 required data objects created by subsetCars.Rmd +INFO [2018-07-05 11:41:30] Saving to data +INFO [2018-07-05 11:41:30] Copied documentation to /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/R/mtcars20.R ✔ Creating 'vignettes/' ✔ Creating 'inst/doc/' -INFO [2018-07-02 12:26:32] Done -INFO [2018-07-02 12:26:32] DataPackageR succeeded -INFO [2018-07-02 12:26:32] Building documentation +INFO [2018-07-05 11:41:30] Done +INFO [2018-07-05 11:41:30] DataPackageR succeeded +INFO [2018-07-05 11:41:30] Building documentation First time using roxygen2. Upgrading automatically... -Updating roxygen version in /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test/DESCRIPTION +Updating roxygen version in /private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20/DESCRIPTION Writing NAMESPACE -Writing Test.Rd +Writing mtcars20.Rd Writing cars_over_20.Rd -INFO [2018-07-02 12:26:32] Building package +INFO [2018-07-05 11:41:30] Building package '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \ --no-environ --no-save --no-restore --quiet CMD build \ - '/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test' \ + '/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20' \ --no-resave-data --no-manual --no-build-vignettes -[1] "/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp7wPrTh/Test_1.0.tar.gz" +[1] "/private/var/folders/jh/x0h3v3pd4dd497g3gtzsm8500000gn/T/Rtmp3EWJ9k/mtcars20_1.0.tar.gz" ``` -### Logging the build process +### A log of the build process + +DataPackageR uses the `futile.logger` pagckage to log progress. -DataPackageR uses the `futile.logger` pagckage to log progress. If there are errors in the processing, the script will notify you via logging to console and to `/private/tmp/Test/inst/extdata/Logfiles/processing.log`. Errors should be corrected and the build repeated. +If there are errors in the processing, the script will notify you via logging to console and to `/private/tmp/Test/inst/extdata/Logfiles/processing.log`. Errors should be corrected and the build repeated. -If everything goes smoothly, you will have a new package built in the parent directory. In this case we have a new package -`Test_1.0.tar.gz`. When the package is installed, it will contain a vignette `subsetCars` that can be loaded using the `vignette()` API. The vignette will detail the processing performed by the `subsetCars.Rmd` processing script. +If everything goes smoothly, you will have a new package built in the parent directory. -### The package source directory after building +In this case we have a new package +`mtcars20_1.0.tar.gz`. + + +### A note about the package source directory after building. + +The pacakge source directory changes after the first build. 
``` levelName -1 Test +1 mtcars20 2 ¦--DATADIGEST 3 ¦--DESCRIPTION 4 ¦--NAMESPACE 5 ¦--R -6 ¦ °--Test.R +6 ¦ °--mtcars20.R 7 ¦--Read-and-delete-me -8 ¦--data-raw -9 ¦ ¦--documentation.R -10 ¦ ¦--subsetCars.R -11 ¦ ¦--subsetCars.Rmd -12 ¦ ¦--subsetCars.knit.md -13 ¦ °--subsetCars.utf8.md -14 ¦--data -15 ¦ °--cars_over_20.rda +8 ¦--data +9 ¦ °--cars_over_20.rda +10 ¦--data-raw +11 ¦ ¦--documentation.R +12 ¦ ¦--subsetCars.R +13 ¦ ¦--subsetCars.Rmd +14 ¦ ¦--subsetCars.knit.md +15 ¦ °--subsetCars.utf8.md 16 ¦--datapackager.yml 17 ¦--inst 18 ¦ ¦--doc @@ -235,89 +228,101 @@ If everything goes smoothly, you will have a new package built in the parent dir 23 ¦ ¦--processing.log 24 ¦ °--subsetCars.html 25 ¦--man -26 ¦ ¦--Test.Rd -27 ¦ °--cars_over_20.Rd +26 ¦ ¦--cars_over_20.Rd +27 ¦ °--mtcars20.Rd 28 °--vignettes 29 °--subsetCars.Rmd ``` -#### Details +#### Update the autogenerated documentation. -A number of things have changed. The subsetCars processing script now appears under `/vignettes` and `inst/doc` as a processed html report so that it will be available to view via `vignette()` once the package is installed. -`inst/extdata/Logfiles` contains a log file of the entire build process as well as intermediate files created while parsing the R / Rmd code. Documentation Rd files appear in `/man`, these should be edite to provide further details on the data objects in the package. The data objects are stored under `/data` where we see `cars_over_20.rda`, the object we initially specified in `datapackager.yml`. +After the first build, the `R` directory contains `mtcars.R` that has autogenerated `roxygen2` markup documentation for the data package and for the packaged data `cars_over20`. +The processed `Rd` files can be found in `man`. -## Versioning data objects +#### Dont' forget to rebuild the package. -The DataPackageR package calculates an md5 checksum of each data object it stores, and keeps track of them in a file -called `DATADIGEST`. +You should update the documentation in `R/mtcars.R`, then call `package_build()` again. -- Each time the package is rebuilt, the md5 sums of the new data objects are compared against the DATADIGEST. -- If they don't match, the build process checks that the `DataVersion` string has been incremented in the `DESCRIPTION` file. -- If it has not the build process will exit and produce an error message. -### DATADIGEST +## Installing and using the new data package -The `DATADIGEST` file contains the following: +### Accessing vignettes, data sets, and data set documentation. +The package source also contains files in the `vignettes` and `inst/doc` directories that provide a log of the data processing. -``` -DataVersion: 0.1.0 -cars_over_20: 3ccb5b0aaa74fe7cfc0d3ca6ab0b5cf3 -``` +When the package is installed, these will be accessible via the `vignette()` API. +The vignette will detail the processing performed by the `subsetCars.Rmd` processing script. -### DESCRIPTION - -The description file has the new `DataVersion` string. +The data set documentation will be accessible via `?cars_over_20`, and the data sets via `data()`. -``` -Package: Test -Type: Package -Title: What the package does (short line) -Version: 1.0 -Date: 2018-07-02 -Author: Who wrote it -Maintainer: Who to complain to -Description: More about what it does (maybe more than one line) -License: What license is it under? -DataVersion: 0.1.0 -RoxygenNote: 6.0.1 +```r +# Let's use the package we just created. 
+install.packages(file.path(tempdir(),"mtcars20_1.0.tar.gz"), type = "source", repos = NULL)
+library(mtcars20)
+data("cars_over_20") # load the data
+cars_over_20 # Now we can use it.
+   speed dist
+44    22   66
+45    23   54
+46    24   70
+47    24   92
+48    24   93
+49    24  120
+50    25   85
+?cars_over_20 # See the documentation you wrote in data-raw/documentation.R.
+
+vignettes = vignette(package="mtcars20")
+vignettes$results
+      Package
+Topic "mtcars20"
+      LibPath
+Topic "/Library/Frameworks/R.framework/Versions/3.5/Resources/library"
+      Item         Title
+Topic "subsetCars" "A Test Document for DataPackageR (source, html)"
```

-### Next steps
-Your downstream data analysis can depend on a specific version of your data package (for example by tesing the `packageVersion()` string);
+### Using the DataVersion

-```r{}
-if(DataPackageR::packageVersion("MyNewStudy") != "1.0.0")
-  stop("The expected version of MyNewStudy is 1.0.0, but ",packageVersion("MyNewStudy")," is installed! Analysis results may differ!")
-```
+Your downstream data analysis can depend on a specific version of the data in your data package by testing the `DataVersion` string in the `DESCRIPTION` file.

-The DataPackageR packge also provides `datasetVersion()` to extract the data set version information.
+We provide an API for this:

-You should also place the data package source directory under `git` version control.
-This allows you to version control your data processing code.
-### Why not use R CMD build?
+```r
+# We can easily check the version of the data.
+DataPackageR::data_version("mtcars20")
+[1] '0.1.0'
+
+# You can use an assert to check the data version in reports and
+# analyses that use the packaged data.
+DataPackageR::assert_data_version(data_package_name = "mtcars20",
+                                  version_string = "0.1.0",
+                                  acceptable = "equal") # If this fails, execution stops
+                                                        # and provides an informative error.
+```

-If the processing script is time consuming or the data set is particularly large, then `R CMD build` would run the code each time the package is installed. In such cases, raw data may not be available, or the environment to do the data processing may not be set up for each user of the data. In such cases, DataPackageR provides a mechanism to decouple data processing from package building/installation for downstream users of the data.
+# Next steps
+
+You should place the data package source directory under `git` version control.
+This allows you to track changes to your data processing code over time.

+# Partial builds and migrating old data packages.

-## Partial builds and migrating old data packages.
+Version 1.12.0 has moved away from controlling the build process using `datasets.R` and an additional `masterfile` argument.

-Version 1.12.0 has moved away from controlling the build process using `datasets.R` and an additional `masterfile` argument. The build process is now controlled via a `datapackager.yml` configuration file located in the package root directory.
+The build process is now controlled via a `datapackager.yml` configuration file located in the package root directory.
(see [YAML Configuration Details](https://github.com/RGLab/DataPackageR/blob/master/YAML_CONFIG.md))

You can migrate an old package by constructing such a config file using the `construct_yml_config()` API.

```r
-#assume I have file1.Rmd and file2.R located in /data-raw, and these create 'object1' and 'object2' respectively.
-
-config = construct_yml_config(code = c("file1.Rmd","file2.R"), data = c("object1","object2"))
-cat(as.yaml(config))
+# assume I have file1.Rmd and file2.R located in /data-raw,
+# and these create 'object1' and 'object2' respectively.
+config = construct_yml_config(code = c("file1.Rmd", "file2.R"),
+                              data = c("object1", "object2"))
+cat(as.yaml(config))
configuration:
  files:
    file1.Rmd:
@@ -330,25 +335,29 @@ configuration:
    - object1
    - object2
  render_root:
-    tmp: '127771'
+    tmp: '288022'
```

`config` is a newly constructed yaml configuration object. It can be written to the package directory:


```r
-path_to_package = tempdir() #pretend this is the root of our package
-yml_write(config,path = path_to_package)
+path_to_package = tempdir() # e.g., if tempdir() were the root of our package.
+yml_write(config, path = path_to_package)
```

Now the package at `path_to_package` will build with version 1.12.0 or greater.

-We can also perform partial builds of a subset of files in a package by toggling the `enabled` key in the config file. This can be done with the following API:
+## Partial builds
+
+We can also perform partial builds of a subset of files in a package by toggling the `enabled` key in the config file.
+
+This can be done with the following API:

```r
config = yml_disable_compile(config,filenames = "file2.R")
-cat(as.yaml(config))
+yml_write(config, path = path_to_package) # write the modified yml back to the package.
+cat(as.yaml(config))
configuration:
  files:
    file1.Rmd:
@@ -361,11 +370,88 @@ configuration:
    - object1
    - object2
  render_root:
-    tmp: '127771'
+    tmp: '288022'
```

-Where `config` is a configuration read from a data package root directory. The `config` object needs to be written back to the package root in order for the changes to take effect. The consequence of toggling a file to `enable: no` is that it will be skipped when the package is built, but the data will be retained, and the documentation will not be altered.
+Note that the modified configuration needs to be written back to the package source directory in order for the
+changes to take effect.
+
+The consequence of toggling a file to `enabled: no` is that it will be skipped when the package is rebuilt,
+but the data will still be retained in the package, and the documentation will not be altered.
+
+This is useful when a package has multiple data sets and we want to re-run one script to update a specific data set,
+without re-running the other, possibly time-consuming, scripts.
+
+# Multi-script pipelines.
+
+We may have multi-script pipelines, where processing is split across several scripts. There are two ways to share data among them:
+1. file system artifacts
+2. data objects passed to subsequent scripts
+
+### File system artifacts
+
+The yaml configuration property `render_root` specifies the working directory where scripts will be rendered.
+
+If a script writes files to its working directory, they will appear under `render_root`, where subsequent scripts can read them.
+
+### Passing data objects to subsequent scripts.
+
+A script (e.g., `script2.Rmd`) that runs after `script1.Rmd` can access a data object named `script1_dataset`, created by `script1.Rmd`, by calling
+
+`DataPackageR::datapackager_object_read("script1_dataset")`.
+
+A short sketch of such a two-script pipeline is given under Additional Details, below.
+
+# Additional Details
+
+We provide some additional details for the interested reader.
+
+### Fingerprints of stored data objects
+
+DataPackageR calculates an md5 checksum of each data object it stores, and keeps track of them in a file
+called `DATADIGEST`.
+
+- Each time the package is rebuilt, the md5 sums of the new data objects are compared against the `DATADIGEST`.
+- If they don't match, the build process checks that the `DataVersion` string has been incremented in the `DESCRIPTION` file.
+- If it has not, the build process will exit and produce an error message.
+
+#### DATADIGEST
+
+
+The `DATADIGEST` file contains the following:
+
+
+```
+DataVersion: 0.1.0
+cars_over_20: 3ccb5b0aaa74fe7cfc0d3ca6ab0b5cf3
+```
+
+
+#### DESCRIPTION
+
+The `DESCRIPTION` file carries the new `DataVersion` string.
+
+
+```
+Package: mtcars20
+Type: Package
+Title: What the package does (short line)
+Version: 1.0
+Date: 2018-07-05
+Author: Who wrote it
+Maintainer: Who to complain to
+Description: More about what it does (maybe more than one line)
+License: What license is it under?
+DataVersion: 0.1.0
+Suggests:
+    knitr,
+    rmarkdown
+VignetteBuilder: knitr
+RoxygenNote: 6.0.1
+```
+
+## Why not use R CMD build?
+
+If the processing script is time consuming or the data set is particularly large, then `R CMD build` would run the code each time the package is installed. The raw data may not be available to every user, or the environment needed to do the processing may be difficult to set up for each user of the data. DataPackageR therefore provides a mechanism to decouple data processing from package building and installation for downstream users of the data.
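+
+## A sketch of a multi-script pipeline
+
+The following is a minimal, hypothetical sketch: the file names `script1.Rmd` and `script2.Rmd` and the object names `script1_dataset` and `script2_dataset` are illustrative only, and are not part of the `mtcars20` example built above. It shows how a two-script pipeline could be declared with `construct_yml_config()`, and how the second script can read an object created by the first via `datapackager_object_read()`.
+
+```r
+# Declare both processing scripts and the data objects they create
+# (hypothetical names), mirroring the construct_yml_config() and
+# yml_write() usage shown earlier in this vignette.
+config <- construct_yml_config(code = c("script1.Rmd", "script2.Rmd"),
+                               data = c("script1_dataset", "script2_dataset"))
+yml_write(config, path = path_to_package)
+
+# Inside script2.Rmd: read the object that script1.Rmd created earlier in the
+# same build, then derive the object that script2.Rmd itself is responsible for.
+script1_dataset <- DataPackageR::datapackager_object_read("script1_dataset")
+script2_dataset <- head(script1_dataset, 10)
+```
+
+Any files that `script1.Rmd` writes to its working directory land under `render_root` and can likewise be read from there by `script2.Rmd`.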