Skip to content

Commit

Permalink
Clean up parquet classes and document the R6
Browse files Browse the repository at this point in the history
  • Loading branch information
nealrichardson committed Sep 10, 2019
1 parent 85a8d36 commit 3e4cfe7
Show file tree
Hide file tree
Showing 11 changed files with 197 additions and 121 deletions.
1 change: 1 addition & 0 deletions dev/release/rat_exclude_files.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ r/README.md
r/README.Rmd
r/man/*.Rd
r/cran-comments.md
r/vignettes/*.Rmd
.gitattributes
ruby/red-arrow/.yardopts
rust/arrow/test/data/*.csv
Expand Down
7 changes: 2 additions & 5 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ S3method(dim,RecordBatch)
S3method(dim,Table)
S3method(length,Array)
S3method(names,RecordBatch)
S3method(parquet_file_reader,RandomAccessFile)
S3method(parquet_file_reader,character)
S3method(parquet_file_reader,raw)
S3method(print,"arrow-enum")
S3method(read_message,InputStream)
S3method(read_message,MessageReader)
Expand Down Expand Up @@ -48,6 +45,8 @@ export(DateUnit)
export(FileMode)
export(MessageReader)
export(MessageType)
export(ParquetFileReader)
export(ParquetReaderProperties)
export(StatusCode)
export(Table)
export(TimeUnit)
Expand Down Expand Up @@ -95,8 +94,6 @@ export(mmap_open)
export(null)
export(num_range)
export(one_of)
export(parquet_arrow_reader_properties)
export(parquet_file_reader)
export(read_arrow)
export(read_csv_arrow)
export(read_delim_arrow)
Expand Down
2 changes: 1 addition & 1 deletion r/R/Table.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#'
#' The `Table$create()` function takes the following arguments:
#'
#' * `...`` arrays, chunked arrays, or R vectors
#' * `...` arrays, chunked arrays, or R vectors
#' * `schema` a schema. The default (`NULL`) infers the schema from the `...`
#'
#' @section Methods:
Expand Down
187 changes: 114 additions & 73 deletions r/R/parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,79 @@
# specific language governing permissions and limitations
# under the License.

#' @include arrow-package.R

`parquet::arrow::FileReader` <- R6Class("parquet::arrow::FileReader",
#' Read a Parquet file
#'
#' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format.
#' This function enables you to read Parquet files into R.
#'
#' @inheritParams read_delim_arrow
#' @param props [ParquetReaderProperties]
#'
#' @return An [arrow::Table][Table], or a `data.frame` if `as_tibble` is
#' `TRUE`.
#' @examples
#' \donttest{
#' df <- read_parquet(system.file("v0.7.1.parquet", package="arrow"))
#' head(df)
#' }
#' @export
read_parquet <- function(file,
                         col_select = NULL,
                         as_tibble = TRUE,
                         props = ParquetReaderProperties$create(),
                         ...) {
  # Open the file; extra arguments in `...` are passed on to the reader factory
  pq_reader <- ParquetFileReader$create(file, props = props, ...)

  # Capture the column selection unevaluated and splice it into ReadTable()
  selection <- enquo(col_select)
  result <- pq_reader$ReadTable(!!selection)

  # Convert the Table to a data.frame when requested, else return it as-is
  if (as_tibble) {
    as.data.frame(result)
  } else {
    result
  }
}

#' @title ParquetFileReader class
#' @rdname ParquetFileReader
#' @name ParquetFileReader
#' @docType class
#' @usage NULL
#' @format NULL
#' @description This class enables you to interact with Parquet files.
#'
#' @section Factory:
#'
#' The `ParquetFileReader$create()` factory method instantiates the object and
#' takes the following arguments:
#'
#' - `file` A character file name, raw vector, or Arrow file connection object
#' (e.g. `RandomAccessFile`).
#' - `props` Optional [ParquetReaderProperties]
#' - `mmap` Logical: whether to memory-map the file (default `TRUE`)
#' - `...` Additional arguments, currently ignored
#'
#' @section Methods:
#'
#' - `$ReadTable(col_select)`: get an `arrow::Table` from the file, possibly
#' with columns filtered by a character vector of column names or a
#' `tidyselect` specification.
#' - `$GetSchema()`: get the `arrow::Schema` of the data in the file
#'
#' @export
#' @examples
#' \donttest{
#' f <- system.file("v0.7.1.parquet", package="arrow")
#' pq <- ParquetFileReader$create(f)
#' pq$GetSchema()
#' tab <- pq$ReadTable()
#' tab$schema
#' }
#' @include arrow-package.R
ParquetFileReader <- R6Class("ParquetFileReader",
inherit = Object,
public = list(
ReadTable = function(col_select = NULL) {
col_select <- enquo(col_select)
if(quo_is_null(col_select)) {
if (quo_is_null(col_select)) {
shared_ptr(Table, parquet___arrow___FileReader__ReadTable1(self))
} else {
all_vars <- shared_ptr(Schema, parquet___arrow___FileReader__GetSchema(self))$names
Expand All @@ -36,7 +101,47 @@
)
)

`parquet::arrow::ArrowReaderProperties` <- R6Class("parquet::arrow::ArrowReaderProperties",
ParquetFileReader$create <- function(file,
                                     props = ParquetReaderProperties$create(),
                                     mmap = TRUE,
                                     ...) {
  # Turn the convenience input types (raw vector, file path) into a file
  # object; any other input is passed through untouched and validated below.
  if (is.raw(file)) {
    file <- BufferReader$create(file)
  } else if (is.character(file)) {
    # A path is either memory-mapped or opened as a regular readable file
    file <- if (isTRUE(mmap)) {
      mmap_open(file)
    } else {
      ReadableFile$create(file)
    }
  }

  # Validate both inputs before handing them to the C++ binding
  assert_that(inherits(file, "RandomAccessFile"))
  assert_that(inherits(props, "ParquetReaderProperties"))

  reader_ptr <- parquet___arrow___FileReader__OpenFile(file, props)
  unique_ptr(ParquetFileReader, reader_ptr)
}

#' @title ParquetReaderProperties class
#' @rdname ParquetReaderProperties
#' @name ParquetReaderProperties
#' @docType class
#' @usage NULL
#' @format NULL
#' @description This class holds settings to control how a Parquet file is read
#' by [ParquetFileReader].
#'
#' @section Factory:
#'
#' The `ParquetReaderProperties$create()` factory method instantiates the object
#' and takes the following arguments:
#'
#' - `use_threads` Logical: whether to use multithreading (default `TRUE`)
#'
#' @section Methods:
#'
#' TODO
#'
#' @export
ParquetReaderProperties <- R6Class("ParquetReaderProperties",
inherit = Object,
public = list(
read_dictionary = function(column_index) {
Expand All @@ -57,77 +162,13 @@
)
)

#' Create a new ArrowReaderProperties instance
#'
#' @param use_threads use threads?
#'
#' @export
#' @keywords internal
parquet_arrow_reader_properties <- function(use_threads = option_use_threads()) {
  # isTRUE() collapses anything other than a single TRUE to FALSE
  props_ptr <- parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads))
  shared_ptr(`parquet::arrow::ArrowReaderProperties`, props_ptr)
}

#' Parquet file reader
#'
#' @inheritParams read_delim_arrow
#' @param props reader file properties, as created by [parquet_arrow_reader_properties()]
#'
#' @param ... additional parameters
#'
#' @export
parquet_file_reader <- function(file, props = parquet_arrow_reader_properties(), ...) {
# S3 generic: dispatches on the class of `file`. Methods in this file cover
# `RandomAccessFile`, `character` and `raw` inputs.
UseMethod("parquet_file_reader")
}

#' @export
parquet_file_reader.RandomAccessFile <- function(file,
                                                 props = parquet_arrow_reader_properties(),
                                                 ...) {
  # Open the file through the C++ binding, then wrap the pointer in the R6 class
  reader_ptr <- parquet___arrow___FileReader__OpenFile(file, props)
  unique_ptr(`parquet::arrow::FileReader`, reader_ptr)
}

#' @export
parquet_file_reader.character <- function(file,
                                          props = parquet_arrow_reader_properties(),
                                          memory_map = TRUE,
                                          ...) {
  path <- normalizePath(file)

  # Memory-map the file or open it as a regular readable file, then
  # re-dispatch on the resulting file object.
  input <- if (isTRUE(memory_map)) {
    mmap_open(path)
  } else {
    ReadableFile$create(path)
  }
  parquet_file_reader(input, props = props, ...)
}

#' @export
parquet_file_reader.raw <- function(file, props = parquet_arrow_reader_properties(), ...) {
parquet_file_reader(BufferReader$create(file), props = props, ...)
ParquetReaderProperties$create <- function(use_threads = option_use_threads()) {
  # isTRUE() collapses anything other than a single TRUE to FALSE
  props_ptr <- parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads))
  shared_ptr(ParquetReaderProperties, props_ptr)
}

#' Read a Parquet file
#'
#' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format.
#' This function enables you to read Parquet files into R.
#'
#' @inheritParams read_delim_arrow
#' @inheritParams parquet_file_reader
#'
#' @return An [arrow::Table][Table], or a `data.frame` if `as_tibble` is
#' `TRUE`.
#' @examples
#' \donttest{
#' try({
#' df <- read_parquet(system.file("v0.7.1.parquet", package="arrow"))
#' })
#' }
#' @export
read_parquet <- function(file,
                         col_select = NULL,
                         as_tibble = TRUE,
                         props = parquet_arrow_reader_properties(),
                         ...) {
  # Open the file; extra arguments in `...` are passed on to the reader generic
  pq_reader <- parquet_file_reader(file, props = props, ...)

  # Capture the column selection unevaluated and splice it into ReadTable()
  selection <- enquo(col_select)
  result <- pq_reader$ReadTable(!!selection)

  # Convert the Table to a data.frame when requested, else return it as-is
  if (as_tibble) {
    as.data.frame(result)
  } else {
    result
  }
}

#' Write Parquet file to disk
#'
Expand Down
5 changes: 4 additions & 1 deletion r/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,17 @@ reference:
- write_arrow
- write_feather
- write_parquet
- title: C++ reader/writer interface
contents:
- csv_convert_options
- csv_parse_options
- csv_read_options
- csv_table_reader
- json_parse_options
- json_read_options
- json_table_reader
- parquet_file_reader
- ParquetFileReader
- ParquetReaderProperties
- FeatherTableReader
- FeatherTableWriter
- JsonTableReader
Expand Down
43 changes: 43 additions & 0 deletions r/man/ParquetFileReader.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions r/man/ParquetReaderProperties.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/Table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 0 additions & 15 deletions r/man/parquet_arrow_reader_properties.Rd

This file was deleted.

18 changes: 0 additions & 18 deletions r/man/parquet_file_reader.Rd

This file was deleted.

Loading

0 comments on commit 3e4cfe7

Please sign in to comment.