Skip to content

Commit

Permalink
Merge ca0593a into a7eb23e
Browse files Browse the repository at this point in the history
  • Loading branch information
pablo-gar authored Jun 12, 2023
2 parents a7eb23e + ca0593a commit acd83e3
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 72 deletions.
5 changes: 5 additions & 0 deletions apis/r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

S3method("[[",MappingBase)
S3method("[[<-",MappingBase)
S3method(as.data.frame,SOMASparseNDArrayRead)
S3method(as.data.frame,TableReadIter)
S3method(as.list,MappingBase)
S3method(as_arrow_table,SOMASparseNDArrayRead)
S3method(as_arrow_table,TableReadIter)
S3method(length,MappingBase)
S3method(names,MappingBase)
S3method(pad_matrix,default)
Expand Down Expand Up @@ -78,6 +82,7 @@ importFrom(Matrix,as.matrix)
importFrom(Matrix,sparseMatrix)
importFrom(Rcpp,evalCpp)
importFrom(arrow,RecordBatch)
importFrom(arrow,as_arrow_table)
importFrom(arrow,concat_arrays)
importFrom(bit64,as.integer64)
importFrom(bit64,lim.integer64)
Expand Down
50 changes: 50 additions & 0 deletions apis/r/R/utils-coercions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#' Coercion methods for SOMA classes

# Set R6 methods formally as S3 to create apply generics
methods::setOldClass("SOMASparseNDArrayRead")
methods::setOldClass("TableReadIter")

#' @importClassesFrom Matrix TsparseMatrix CsparseMatrix RsparseMatrix
NULL

#' @importFrom arrow as_arrow_table
#' @export
arrow::as_arrow_table


#' @importFrom arrow as_arrow_table
#' @export
as_arrow_table.SOMASparseNDArrayRead <- function(x){
x$tables()$concat()
}

#' @export
as.data.frame.SOMASparseNDArrayRead <- function(x, ...){
as.data.frame(x$tables()$concat(), ...)
}

# Coerce \link[tiledbsoma]{SOMASparseNDArrayRead} to Matrix::\link[Matrix]{dgTMatrix}
setAs(from = "SOMASparseNDArrayRead",
to = "TsparseMatrix",
def = function(from) from$sparse_matrix()$concat()
)

# Coerce \link[tiledbsoma]{SOMASparseNDArrayRead} to Matrix::\link[Matrix]{dgCMatrix}
setAs(from = "SOMASparseNDArrayRead",
to = "CsparseMatrix",
def = function(from) as(as(from, "TsparseMatrix"), "CsparseMatrix")
)

# Coerce \link[tiledbsoma]{SOMASparseNDArrayRead} to Matrix::\link[Matrix]{dgRMatrix}
setAs(from = "SOMASparseNDArrayRead",
to = "RsparseMatrix",
def = function(from) as(as(from, "TsparseMatrix"), "RsparseMatrix")
)

#' @export
as_arrow_table.TableReadIter <- function(x) x$concat()

#' @export
as.data.frame.TableReadIter <- function(x, row.names = NULL, optional = FALSE, ...){
as.data.frame(x$concat(), row.names = row.names, optional = optional, ...)
}
3 changes: 1 addition & 2 deletions apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,12 @@ test_that("Arrow Interface from SOMAArrayReader", {
expect_true(all(tb2$n_counts < 1000))
expect_true(all(tb2$n_genes >= 400))


# read everything
tb3 <- soma_array_to_arrow_table(soma_array_reader(uri))

expect_equal(tb3$num_rows, 2638)
expect_equal(tb3$num_columns, 6)

# read a subset of rows and columns
tb4 <- soma_array_reader(uri = uri,
colnames = c("obs_id", "percent_mito", "n_counts", "louvain"),
Expand Down
10 changes: 9 additions & 1 deletion apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ test_that("SOMADataFrame read", {
expect_equal(z$num_columns, 3L)
expect_equal(z$ColumnNames(), columns)
sdf$close()

columns <- c("n_counts", "does_not_exist")
sdf <- SOMADataFrameOpen(uri)
expect_error(sdf$read(column_names=columns))
Expand All @@ -233,6 +233,14 @@ test_that("SOMADataFrame read", {
z <- sdf$read(coords = list(soma_joinid=coords))$concat()
expect_equal(z$num_rows, 10L)
sdf$close()

# coercion from TableReader to arrow Table and data.frame
sdf <- SOMADataFrameOpen(uri)
all.equal(sdf$read()$concat(),
arrow::as_arrow_table(sdf$read()))
all.equal(as.data.frame(sdf$read()$concat()),
as.data.frame(sdf$read()))
sdf$close()
})

test_that("soma_ prefix is reserved", {
Expand Down
103 changes: 66 additions & 37 deletions apis/r/tests/testthat/test-SOMASparseNDArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,69 +73,98 @@ test_that("SOMASparseNDArray creation", {

})

test_that("SOMASparseNDArray read_sparse_matrix", {
test_that("SOMASparseNDArray read", {
uri <- withr::local_tempdir("sparse-ndarray")
ndarray <- SOMASparseNDArrayCreate(uri, arrow::int32(), shape = c(10, 10))

# For this test, write 9x9 data into 10x10 array. Leaving the last row & column
# empty touches corner cases with setting dims() correctly
mat <- create_sparse_matrix_with_int_dims(9, 9)
mat <- create_sparse_matrix_with_int_dims(9, 9, repr = "T")
ndarray$write(mat)
expect_equal(as.numeric(ndarray$shape()), c(10, 10))
ndarray$close()

# read_sparse_matrix

ndarray <- SOMASparseNDArrayOpen(uri)
mat2 <- ndarray$read()$sparse_matrix(zero_based = T)$concat()

# read sparse matrix directly
mat2 <- ndarray$read()$sparse_matrix(zero_based = F)$concat()
expect_s4_class(mat2, "sparseMatrix")
expect_s4_class(mat2, "dgTMatrix")
expect_equal(dim(mat2), c(10, 10))
expect_equal(nrow(mat2), 10)
expect_equal(ncol(mat2), 10)
all.equal(mat, mat2)
all.equal(mat[1:9, 1:9], mat2[1:9, 1:9])

# test coerced matrix: dgTMatrix
mat2 <- ndarray$read()$sparse_matrix(zero_based = F)$concat()
mat_coerced <- as(ndarray$read(), "TsparseMatrix")
expect_s4_class(mat_coerced, "dgTMatrix")
all.equal(mat2, mat_coerced)

# test coerced matrix: dgCMatrix
mat2 <- ndarray$read()$sparse_matrix(zero_based = F)$concat()
mat_coerced <- as(ndarray$read(), "CsparseMatrix")
expect_s4_class(mat_coerced, "dgCMatrix")
all.equal(as(mat2, "CsparseMatrix"), mat_coerced)

# test coerced matrix: dgTMatrix
mat2 <- ndarray$read()$sparse_matrix(zero_based = F)$concat()
mat_coerced <- as(ndarray$read(), "RsparseMatrix")
expect_s4_class(mat_coerced, "dgRMatrix")
all.equal(as(mat2, "RsparseMatrix"), mat_coerced)


# test zero-based Matrix view
mat2 <- ndarray$read()$sparse_matrix(zero_based=T)$concat()
expect_true(inherits(mat2, "matrixZeroBasedView"))
expect_s4_class(mat2$get_one_based_matrix(), "sparseMatrix")
expect_s4_class(mat2$get_one_based_matrix(), "dgTMatrix")
expect_equal(mat2$dim(), c(10, 10))
expect_equal(mat2$nrow(), 10)
expect_equal(mat2$ncol(), 10)
## not sure why all.equal(mat, mat2) does not pass
expect_true(all.equal(as.numeric(mat[1:9, 1:9]), as.numeric(mat2$take(0:8, 0:8)$get_one_based_matrix())))
expect_equal(sum(mat), sum(mat2$get_one_based_matrix()))

ndarray <- SOMASparseNDArrayOpen(uri)

ndarray$close()
})

test_that("SOMASparseNDArray read_sparse_matrix_zero_based", {
uri <- withr::local_tempdir("sparse-ndarray")
ndarray <- SOMASparseNDArrayCreate(uri, arrow::int32(), shape = c(10, 10))

# For this test, write 9x9 data into 10x10 array. Leaving the last row & column
# empty touches corner cases with setting dims() correctly
mat <- create_sparse_matrix_with_int_dims(9, 9)
ndarray$write(mat)
expect_equal(as.numeric(ndarray$shape()), c(10, 10))
ndarray$close()

# read_sparse_matrix
ndarray <- SOMASparseNDArrayOpen(uri)
mat2 <- ndarray$read()$sparse_matrix(zero_based=T)$concat()
all.equal(mat, mat2$get_one_based_matrix)
all.equal(mat, mat2$take(0:8,0:8)$get_one_based_matrix())

# test zero-based view and convert to sparse matrix
mat2 <- ndarray$read()$sparse_matrix(zero_based = T)$concat()
expect_true(inherits(mat2, "matrixZeroBasedView"))
expect_s4_class(mat2$get_one_based_matrix(), "sparseMatrix")
expect_equal(mat2$dim(), c(10, 10))
expect_equal(mat2$nrow(), 10)
expect_equal(mat2$ncol(), 10)
## not sure why all.equal(mat, mat2) does not pass
expect_true(all.equal(as.numeric(mat), as.numeric(mat2$take(0:8,0:8)$get_one_based_matrix())))
expect_equal(sum(mat), sum(mat2$get_one_based_matrix()))

ndarray <- SOMASparseNDArrayOpen(uri)
all.equal(mat, mat2)
all.equal(mat[1:9, 1:9], mat2$take(0:8, 0:8)$get_one_based_matrix())

# repeat with iterated reader
# test with iterated reader
iterator <- ndarray$read()$sparse_matrix(zero_based = T)
mat2 <- iterator$read_next()
expect_true(inherits(mat2, "matrixZeroBasedView"))
expect_s4_class(mat2$get_one_based_matrix(), "sparseMatrix")
expect_equal(mat2$dim(), c(10, 10))
expect_equal(mat2$nrow(), 10)
expect_equal(mat2$ncol(), 10)
expect_true(all.equal(as.numeric(mat), as.numeric(mat2$take(0:8,0:8)$get_one_based_matrix())))
expect_equal(sum(mat), sum(mat2$get_one_based_matrix()))
all.equal(mat, mat2$get_one_based_matrix)
all.equal(mat, mat2$take(0:8,0:8)$get_one_based_matrix())

# test arrow table reader
df <- data.frame(soma_dim_0 = mat@i, soma_dim_1 = mat@j, soma_data = mat@x)
tbl <- arrow::arrow_table(df[order(df$soma_dim_0, df$soma_dim_1),])
tbl2 <- ndarray$read(result_order = "ROW_MAJOR")$tables()$concat()
all.equal(tbl, tbl2)

# test arrow table coercion
df <- data.frame(soma_dim_0 = mat@i, soma_dim_1 = mat@j, soma_data = mat@x)
tbl <- arrow::arrow_table(df[order(df$soma_dim_0, df$soma_dim_1),])
tbl2 <- arrow::as_arrow_table(ndarray$read(result_order = "ROW_MAJOR"))
all.equal(tbl, tbl2)

# test data.frame coercion
df <- data.frame(soma_dim_0 = mat@i, soma_dim_1 = mat@j, soma_data = mat@x)
df <- df[order(df$soma_dim_0, df$soma_dim_1),]
df2 <- as.data.frame(ndarray$read(result_order = "ROW_MAJOR"))
expect_true(inherits(df2, "data.frame"))
all.equal(df, df2)

ndarray$close()
})

Expand Down
85 changes: 70 additions & 15 deletions apis/r/vignettes/soma-objects.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -71,39 +71,55 @@ experiment$obs$schema()

Note that `soma_joinid` is a field that exists in every `SOMADataFrame` and acts as a join key for other objects in the dataset.

Again, when a SOMA object is accessed, only a pointer is returned and no data is read into memory. To load the data in memory, we call `read()$concat()`, which returns an [Arrow Table](https://arrow.apache.org/docs/r/reference/Table.html) and is easily converted to a data frame by appending `$to_data_frame()`.
When a SOMADataFrame is read an iterator is returned. To load the data in memory, we can coerce the iterator from `$read()` with `as.data.frame` or `arrow:as_arrow_table()`, which respectively return a `data.frame` or an [Arrow Table](https://arrow.apache.org/docs/r/reference/Table.html).

```{r}
experiment$obs$read()$concat()
as.data.frame(experiment$obs$read())
```

The amount of data that can be read at once is determined by the `soma.init_buffer_bytes` configuration parameter, which, by default, is set to 16MB for each column. If the requested data is larger than this value an error will be thrown.
```{r}
arrow::as_arrow_table(experiment$obs$read())
```

If your system has more memory, you can increase this parameter to a larger value to read in more data at once. Alternatively, you can use the iterated reader, which retrieves data in chunks that are smaller than the `soma.init_buffer_bytes` parameter. The result of which is a list of Arrow Tables.
Alternatively, you can use the iterator, which retrieves data in chunks that are smaller than the `soma.init_buffer_bytes` parameter. You can user the iterator's method `$read_next()` to load a chunk
in memory.

```{r}
iterator <- experiment$obs$read()
iterator$read_next()
```

In this example the full table is relatively small and fits all in one chunk.

For bigger `SOMADataFrame`s you can check if the iteration has finished by checking the logical `$read_complete()`, and you can concatenate the rest of the chunks by using `$concat()`.

Here we demonstrate by creating a fresh new iterator.

```{r}
iterator <- experiment$obs$read()
iterator$read_complete()
```
```{r}
iterator$concat()
```
We can also select a subset of rows from the `SOMADataFrame` using the `coords` argument. This will retrieve only the required subset from disk to memory. In this example, we will select only the first 10 rows:

*NOTE: The `coords` argument is 0-based.*

```{r}
experiment$obs$read(coords = 0:9)$concat()
as.data.frame(experiment$obs$read(coords = 0:9))
```

As TileDB is a columnar format, we can also select a subset of the columns:

```{r}
experiment$obs$read(0:9, column_names = c("obs_id", "nCount_RNA"))$concat()
as.data.frame(experiment$obs$read(0:9, column_names = c("obs_id", "nCount_RNA")))
```

Finally, we can use `value_filter` to retrieve a subset of rows that match a certain condition.

```{r}
experiment$obs$read(value_filter = "nCount_RNA > 100")$concat()
as.data.frame(experiment$obs$read(value_filter = "nCount_RNA > 100"))
```

And of course, you can combine all of these arguments together to get at only the data you need.
Expand Down Expand Up @@ -180,19 +196,58 @@ We can get the number of non-zero elements by calling `nnz()`:
X_data$nnz()
```

Let's load the data as a sparse matrix into memory:
Let's load the data as a sparse matrix into memory. The following function are availabe to coerce the output of `$read()`:

- `arrow::as_arrow_table(x)` to an Arrow `Table`.
- `as(x, "TsparseMatrix")` to a `dgTMatrix` from the Matrix package.
- `as(x, "CsparseMatrix")` to a `dgCMatrix` from the Matrix package.
- `as(x, "RsparseMatrix")` to a `dgRMatrix` from the Matrix package.

Keep in mind that dgTMatrix is the most memory-efficient option from the Matrix package.

```{r}
arrow::as_arrow_table(X_data$read())
```
```{r}
sparse <- as(X_data$read(), "TsparseMatrix")
str(sparse)
```

```{r}
sparse <- as(X_data$read(), "CsparseMatrix")
str(sparse)
```

```{r}
sparse <- as(X_data$read(), "RsparseMatrix")
str(sparse)
```

*Note: We are reading the full matrix into memory and then subsetting it to
the first 5 rows and 10 columns to truncate the output. This uses the
zero-based underlying representation access but then accesses a one-based
view as the sparse matrix functionality from package `Matrix` imposes this.*
Similarly to `SOMADataFrame`s, with the `$read()` method we can define coordinates to slice obtain a subset of the matrix from disk:

```{r}
X_data$read()$sparse_matrix()$concat()[1:5, 1:10]
arrow::as_arrow_table((X_data$read(coords = list(soma_dim_0 = 0:4, soma_dim_1 = 0:9))))
```

Similarly to `SOMADataFrame`s, `read()` method we can define coordinates to slice obtain a subset of the matrix from disk:
And you can also iterate by chunks. However with `SOMASparseNDArray` the method `$read()` returns a reader
that gives you access to 2 different iterators, one for Arrow Table and one for dgTMatrix.

Let's take a look at the Arrow Table iterator:

```{r}
X_data$read(coords = list(soma_dim_0 = 0:4, soma_dim_1 = 0:9))$sparse_matrix()$concat()
reader <- X_data$read()
iterator <- reader$tables()
iterator$read_next()
```
Similar to the `SOMADataFrame` example the data is small enough to fit in one chunk. For bigger data
you can user `iterator$read_complete()` to check the status of iteration and `iterator$concat()`
to concatenate the rest of the chunks.

The dgTMatrix iterator works in the same way:

```{r}
reader <- X_data$read()
iterator <- reader$sparse_matrix()
str(iterator$read_next())
```

Loading

0 comments on commit acd83e3

Please sign in to comment.