Use arrow format for datasets (#382)

* Use arrow format for datasets
* Arrow version at 0.3 - thanks to @quinnj for all his work on this version of Arrow
* Removes direct dependency on `Feather.jl` and indirect dependencies on `DataFrames.jl` and `CategoricalArrays.jl`
JuliaStats · Oct 5, 2020 · b7402e2 · b7402e2
1 parent 52874ee
commit b7402e2
Show file tree

Hide file tree

Showing 11 changed files with 39 additions and 28 deletions.
diff --git a/Artifacts.toml b/Artifacts.toml
@@ -5,21 +5,21 @@
 [TestData]
 # compute this using
 # using Tar, Inflate, SHA
-# filename = "download?version=4" # I just used wget for the URL below and this is how it saved it
+# filename = "download?version=2" # I just used wget for the URL below and this is how it saved it
 # println("sha256: ", bytes2hex(open(sha256, filename)))
 # println("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(filename))))
 # from https://julialang.github.io/Pkg.jl/dev/artifacts/
-git-tree-sha1 = "4d1410cd290622e426411273ef379fe82b749ca4"
+git-tree-sha1 = "a3955a5f747d01e628944b1031b44e31f027ebef"
 lazy = true
 
     [[TestData.download]]
-    # this is the SHA from https://osf.io/pcjk6/?show=revision
-    sha256 = "b66369456c0ec9d1490d61d0c0686999e6422051295aeb9e14ab27693ccaec54"
+    # this is the SHA from https://osf.io/djaqb/download?version=2
+    sha256 = "b6273f0cfeb5b12e2afede33de6d68a8d926e7b684cf071c7622f1e6ef7aa64a"
     # when updating this, make sure to change the version number,
     # because if the version number isn't included, it will always point to the
     # latest version, which means it will break existing users when we update
     # between releases.
-    url = "https://osf.io/pcjk6/download?version=4"
+    url = "https://osf.io/djaqb/download?version=2"
 
     # for future work on using xz-compressed data:
     # Julia invokes wget without using HTTP metadata, so we need the link

diff --git a/Project.toml b/Project.toml
@@ -4,10 +4,10 @@ author = ["Phillip Alday <[email protected]>", "Douglas Bates <[email protected]
 version = "3.0.0-DEV"
 
 [deps]
+Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
-Feather = "becb17da-46f6-5d3c-ad1b-1c5fe96bc73c"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
@@ -24,10 +24,10 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
+Arrow = "0.3"
 BlockArrays = "0.11, 0.12"
 DataAPI = "1.1, 1.2, 1.3"
 Distributions = "0.21, 0.22, 0.23"
-Feather = "0.5"
 GLM = "1"
 NLopt = "0.5, 0.6"
 PooledArrays = "0.5"

diff --git a/src/MixedModels.jl b/src/MixedModels.jl
@@ -1,9 +1,9 @@
 module MixedModels
 
+using Arrow
 using BlockArrays
 using DataAPI
 using Distributions
-using Feather
 using GLM
 using LinearAlgebra
 using NLopt

diff --git a/src/linearmixedmodel.jl b/src/linearmixedmodel.jl
@@ -70,7 +70,8 @@ function LinearMixedModel(
     y, Xs = modelcols(form, tbl)
 
     y = reshape(float(y), (:, 1)) # y as a floating-point matrix
-    T = eltype(y)
+    T = promote_type(Float64, eltype(y))  # ensure that eltype of model matrices is at least Float64
+    y = convert(Matrix{T}, y)
 
     reterms = AbstractReMat{T}[]
     feterms = FeMat{T}[]

diff --git a/src/utilities.jl b/src/utilities.jl
@@ -138,20 +138,20 @@ function replicate(f::Function, n::Integer; use_threads=false)
     results
 end
 
-cacheddatasets = Dict{String,Any}()
+cacheddatasets = Dict{String, Arrow.Table}()
 """
     dataset(nm)
 
 Return, as an `Arrow.Table`, the test data set named `nm`, which can be a `String` or `Symbol`
 """
 function dataset(nm::AbstractString)
     get!(cacheddatasets, nm) do
-        path = joinpath(TestData, nm * ".feather")
+        path = joinpath(TestData, nm * ".arrow")
         if !isfile(path)
             throw(ArgumentError(
                 "Dataset \"$nm\" is not available.\nUse MixedModels.datasets() for available names."))
         end
-        Feather.read(path)
+        Arrow.Table(path)
     end
 end
 dataset(nm::Symbol) = dataset(string(nm))
@@ -161,7 +161,7 @@ dataset(nm::Symbol) = dataset(string(nm))
 
 Return a vector of names of the available test data sets
 """
-datasets() = first.(Base.Filesystem.splitext.(filter(Base.Fix2(endswith, ".feather"), readdir(TestData))))
+datasets() = first.(Base.Filesystem.splitext.(filter(endswith(".arrow"), readdir(TestData))))
 
 
 """

diff --git a/test/FactorReTerm.jl b/test/FactorReTerm.jl
@@ -1,3 +1,4 @@
+using DataFrames
 using LinearAlgebra
 using MixedModels
 using Random
@@ -101,8 +102,9 @@ end
 end
 
 @testset "Categorical Blocking Variable" begin
-    # deepcopy because we're going to modify it
-    slp = deepcopy(dataset("sleepstudy"))
+    # deepcopy because we're going to modify it.  Don't need the copy if dataset returns an Arrow.Table
+    #slp = deepcopy(DataFrame(dataset("sleepstudy")))
+    slp = DataFrame(dataset("sleepstudy"))
     contrasts =  Dict{Symbol,Any}()
     f = @formula(reaction ~ 1 + (1|subj))
 

diff --git a/test/bootstrap.jl b/test/bootstrap.jl
@@ -65,4 +65,4 @@ end
         @test sort(columntable(bsamp_threaded.β).β) == sort(columntable(bsamp.β).β)
         @test sum(issingular(bsamp)) == sum(issingular(bsamp_threaded))
     end
-end
+end
diff --git a/test/missing.jl b/test/missing.jl
@@ -1,8 +1,10 @@
-using MixedModels, Test
+using DataFrames
+using MixedModels
+using Test
 
-# deepcopy because we're going to modify it
-slp = deepcopy(MixedModels.dataset(:sleepstudy))
-slp[!,:days] = Array{Union{Missing, Float64},1}(slp[!,:days])
+# convert to DataFrame to modify it
+slp = DataFrame(MixedModels.dataset(:sleepstudy))
+allowmissing!(slp, :days)
 slp[1,:days] = missing
 
 # TODO: re-enable this test when better missing support has landed in StatsModels

diff --git a/test/pirls.jl b/test/pirls.jl
@@ -1,4 +1,6 @@
-using MixedModels, Tables
+using DataFrames
+using MixedModels
+using Tables
 using Test
 
 using MixedModels: dataset
@@ -81,7 +83,7 @@ end
 
 @testset "grouseticks" begin
     center(v::AbstractVector) = v .- (sum(v) / length(v))
-    grouseticks = dataset(:grouseticks)
+    grouseticks = DataFrame(dataset(:grouseticks))
     grouseticks.ch = center(grouseticks.height)
     gm4 = fit(MixedModel, only(gfms[:grouseticks]), grouseticks, Poisson(), fast=true)  # fails in pirls! with fast=false
     @test isapprox(deviance(gm4), 851.4046, atol=0.001)

diff --git a/test/pls.jl b/test/pls.jl
@@ -69,15 +69,15 @@ include("modelcache.jl")
     @test fm1.σ ≈ 49.510099986291145 atol=1.e-5
     @test fm1.X == ones(30,1)
     ds = MixedModels.dataset(:dyestuff)
-    @test fm1.y == ds[!, :yield]
+    @test fm1.y == ds[:yield]
     @test cond(fm1) == ones(1)
     @test first(leverage(fm1)) ≈ 0.15650534392640486 rtol=1.e-5
     @test sum(leverage(fm1)) ≈ 4.695160317792145 rtol=1.e-5
     cm = coeftable(fm1)
     @test length(cm.rownms) == 1
     @test length(cm.colnms) == 4
     @test fnames(fm1) == (:batch,)
-    @test response(fm1) == ds[!, :yield]
+    @test response(fm1) == ds[:yield]
     rfu = ranef(fm1, uscale = true)
     rfb = ranef(fm1)
     @test abs(sum(rfu[1])) < 1.e-5
@@ -127,7 +127,7 @@ end
     @test coef(fm) ≈ [5.6656]
     @test logdet(fm) ≈ 0.0
     @test issingular(fm)
-    refit!(fm, float(MixedModels.dataset(:dyestuff)[!, :yield]))
+    refit!(fm, float(MixedModels.dataset(:dyestuff)[:yield]))
     @test objective(fm) ≈ 327.3270598811428 atol=0.001
 end
 
@@ -333,7 +333,7 @@ end
     @test logdet(fm_ind) ≈ logdet(fmnc)
 
     # combining [ReMat{T,S1}, ReMat{T,S2}] for S1 ≠ S2
-    slpcat = categorical!(deepcopy(slp), [:days])
+    slpcat = categorical!(DataFrame(slp), [:days])
     fm_cat = fit(MixedModel, @formula(reaction ~ 1+days+(1|subj)+(0+days|subj)),slpcat)
     @test fm_cat isa LinearMixedModel
     σρ = fm_cat.σρs
@@ -391,6 +391,7 @@ end
 end
 
 @testset "kb07" begin
+    global io
     pca = last(models(:kb07)).PCA
     @test keys(pca) == (:subj, :item)
     show(io, models(:kb07)[2])

diff --git a/test/utilities.jl b/test/utilities.jl
@@ -51,8 +51,11 @@ end
 
 @testset "datasets" begin
 	@test isa(MixedModels.datasets(), Vector{String})
-	@test size(MixedModels.dataset(:dyestuff)) == (30, 2)
-	@test size(MixedModels.dataset("dyestuff")) == (30, 2)
+	@test length(MixedModels.dataset(:dyestuff)) == 2
+	@test length(MixedModels.dataset("dyestuff")) == 2
+	dyestuff = MixedModels.dataset(:dyestuff);
+	@test keys(dyestuff) == [:batch, :yield]
+	@test length(dyestuff.batch) == 30
 	@test_throws ArgumentError MixedModels.dataset(:foo)
 end
-Original file line number
+Diff line change
@@ Expand Up / @@ -65,4 +65,4 @@ end @@
             @test sort(columntable(bsamp_threaded.β).β) == sort(columntable(bsamp.β).β)
             @test sum(issingular(bsamp)) == sum(issingular(bsamp_threaded))
         end
-    end
+    end