Skip to content

Commit

Permalink
Use arrow format for datasets (#382)
Browse files Browse the repository at this point in the history
* Use arrow format for datasets

* Arrow version at 0.3 - thanks to @quinnj for all his work on this version of Arrow

* Removes direct dependency on `Feather.jl` and indirect dependencies on `DataFrames.jl` and `CategoricalArrays.jl`
  • Loading branch information
dmbates authored Oct 5, 2020
1 parent 52874ee commit b7402e2
Show file tree
Hide file tree
Showing 11 changed files with 39 additions and 28 deletions.
10 changes: 5 additions & 5 deletions Artifacts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@
[TestData]
# compute this using
# using Tar, Inflate, SHA
# filename = "download?version=4" # I just used wget for the URL below and this is how it saved it
# filename = "download?version=2" # I just used wget for the URL below and this is how it saved it
# println("sha256: ", bytes2hex(open(sha256, filename)))
# println("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(filename))))
# from https://julialang.github.io/Pkg.jl/dev/artifacts/
git-tree-sha1 = "4d1410cd290622e426411273ef379fe82b749ca4"
git-tree-sha1 = "a3955a5f747d01e628944b1031b44e31f027ebef"
lazy = true

[[TestData.download]]
# this is the SHA from https://osf.io/pcjk6/?show=revision
sha256 = "b66369456c0ec9d1490d61d0c0686999e6422051295aeb9e14ab27693ccaec54"
# this is the SHA from https://osf.io/djaqb/download?version=2
sha256 = "b6273f0cfeb5b12e2afede33de6d68a8d926e7b684cf071c7622f1e6ef7aa64a"
# when updating this, make sure to change the version number,
# because if the version number isn't included, it will always point to the
# latest version, which means it will break existing users when we update
# between releases.
url = "https://osf.io/pcjk6/download?version=4"
url = "https://osf.io/djaqb/download?version=2"

# for future work on using xz-compressed data:
# Julia invokes wget without using HTTP metadata, so we need the link
Expand Down
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ author = ["Phillip Alday <[email protected]>", "Douglas Bates <[email protected]
version = "3.0.0-DEV"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Feather = "becb17da-46f6-5d3c-ad1b-1c5fe96bc73c"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
Expand All @@ -24,10 +24,10 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Arrow = "0.3"
BlockArrays = "0.11, 0.12"
DataAPI = "1.1, 1.2, 1.3"
Distributions = "0.21, 0.22, 0.23"
Feather = "0.5"
GLM = "1"
NLopt = "0.5, 0.6"
PooledArrays = "0.5"
Expand Down
2 changes: 1 addition & 1 deletion src/MixedModels.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
module MixedModels

using Arrow
using BlockArrays
using DataAPI
using Distributions
using Feather
using GLM
using LinearAlgebra
using NLopt
Expand Down
3 changes: 2 additions & 1 deletion src/linearmixedmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ function LinearMixedModel(
y, Xs = modelcols(form, tbl)

y = reshape(float(y), (:, 1)) # y as a floating-point matrix
T = eltype(y)
T = promote_type(Float64, eltype(y)) # ensure that eltype of model matrices is at least Float64
y = convert(Matrix{T}, y)

reterms = AbstractReMat{T}[]
feterms = FeMat{T}[]
Expand Down
8 changes: 4 additions & 4 deletions src/utilities.jl
Original file line number Diff line number Diff line change
Expand Up @@ -138,20 +138,20 @@ function replicate(f::Function, n::Integer; use_threads=false)
results
end

cacheddatasets = Dict{String,Any}()
cacheddatasets = Dict{String, Arrow.Table}()
"""
dataset(nm)
Return, as an `Arrow.Table`, the test data set named `nm`, which can be a `String` or `Symbol`
"""
function dataset(nm::AbstractString)
get!(cacheddatasets, nm) do
path = joinpath(TestData, nm * ".feather")
path = joinpath(TestData, nm * ".arrow")
if !isfile(path)
throw(ArgumentError(
"Dataset \"$nm\" is not available.\nUse MixedModels.datasets() for available names."))
end
Feather.read(path)
Arrow.Table(path)
end
end
dataset(nm::Symbol) = dataset(string(nm))
Expand All @@ -161,7 +161,7 @@ dataset(nm::Symbol) = dataset(string(nm))
Return a vector of names of the available test data sets
"""
datasets() = first.(Base.Filesystem.splitext.(filter(Base.Fix2(endswith, ".feather"), readdir(TestData))))
datasets() = first.(Base.Filesystem.splitext.(filter(endswith(".arrow"), readdir(TestData))))


"""
Expand Down
6 changes: 4 additions & 2 deletions test/FactorReTerm.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using DataFrames
using LinearAlgebra
using MixedModels
using Random
Expand Down Expand Up @@ -101,8 +102,9 @@ end
end

@testset "Categorical Blocking Variable" begin
# deepcopy because we're going to modify it
slp = deepcopy(dataset("sleepstudy"))
# deepcopy because we're going to modify it. Don't need the copy if dataset returns an Arrow.Table
#slp = deepcopy(DataFrame(dataset("sleepstudy")))
slp = DataFrame(dataset("sleepstudy"))
contrasts = Dict{Symbol,Any}()
f = @formula(reaction ~ 1 + (1|subj))

Expand Down
2 changes: 1 addition & 1 deletion test/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ end
@test sort(columntable(bsamp_threaded.β).β) == sort(columntable(bsamp.β).β)
@test sum(issingular(bsamp)) == sum(issingular(bsamp_threaded))
end
end
end
10 changes: 6 additions & 4 deletions test/missing.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
using MixedModels, Test
using DataFrames
using MixedModels
using Test

# deepcopy because we're going to modify it
slp = deepcopy(MixedModels.dataset(:sleepstudy))
slp[!,:days] = Array{Union{Missing, Float64},1}(slp[!,:days])
# convert to DataFrame to modify it
slp = DataFrame(MixedModels.dataset(:sleepstudy))
allowmissing!(slp, :days)
slp[1,:days] = missing

# TODO: re-enable this test when better missing support has landed in StatsModels
Expand Down
6 changes: 4 additions & 2 deletions test/pirls.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using MixedModels, Tables
using DataFrames
using MixedModels
using Tables
using Test

using MixedModels: dataset
Expand Down Expand Up @@ -81,7 +83,7 @@ end

@testset "grouseticks" begin
center(v::AbstractVector) = v .- (sum(v) / length(v))
grouseticks = dataset(:grouseticks)
grouseticks = DataFrame(dataset(:grouseticks))
grouseticks.ch = center(grouseticks.height)
gm4 = fit(MixedModel, only(gfms[:grouseticks]), grouseticks, Poisson(), fast=true) # fails in pirls! with fast=false
@test isapprox(deviance(gm4), 851.4046, atol=0.001)
Expand Down
9 changes: 5 additions & 4 deletions test/pls.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,15 @@ include("modelcache.jl")
@test fm1.σ ≈ 49.510099986291145 atol=1.e-5
@test fm1.X == ones(30,1)
ds = MixedModels.dataset(:dyestuff)
@test fm1.y == ds[!, :yield]
@test fm1.y == ds[:yield]
@test cond(fm1) == ones(1)
@test first(leverage(fm1)) ≈ 0.15650534392640486 rtol=1.e-5
@test sum(leverage(fm1)) ≈ 4.695160317792145 rtol=1.e-5
cm = coeftable(fm1)
@test length(cm.rownms) == 1
@test length(cm.colnms) == 4
@test fnames(fm1) == (:batch,)
@test response(fm1) == ds[!, :yield]
@test response(fm1) == ds[:yield]
rfu = ranef(fm1, uscale = true)
rfb = ranef(fm1)
@test abs(sum(rfu[1])) < 1.e-5
Expand Down Expand Up @@ -127,7 +127,7 @@ end
@test coef(fm) ≈ [5.6656]
@test logdet(fm) ≈ 0.0
@test issingular(fm)
refit!(fm, float(MixedModels.dataset(:dyestuff)[!, :yield]))
refit!(fm, float(MixedModels.dataset(:dyestuff)[:yield]))
@test objective(fm) ≈ 327.3270598811428 atol=0.001
end

Expand Down Expand Up @@ -333,7 +333,7 @@ end
@test logdet(fm_ind) ≈ logdet(fmnc)

# combining [ReMat{T,S1}, ReMat{T,S2}] for S1 ≠ S2
slpcat = categorical!(deepcopy(slp), [:days])
slpcat = categorical!(DataFrame(slp), [:days])
fm_cat = fit(MixedModel, @formula(reaction ~ 1+days+(1|subj)+(0+days|subj)),slpcat)
@test fm_cat isa LinearMixedModel
σρ = fm_cat.σρs
Expand Down Expand Up @@ -391,6 +391,7 @@ end
end

@testset "kb07" begin
global io
pca = last(models(:kb07)).PCA
@test keys(pca) == (:subj, :item)
show(io, models(:kb07)[2])
Expand Down
7 changes: 5 additions & 2 deletions test/utilities.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,11 @@ end

@testset "datasets" begin
@test isa(MixedModels.datasets(), Vector{String})
@test size(MixedModels.dataset(:dyestuff)) == (30, 2)
@test size(MixedModels.dataset("dyestuff")) == (30, 2)
@test length(MixedModels.dataset(:dyestuff)) == 2
@test length(MixedModels.dataset("dyestuff")) == 2
dyestuff = MixedModels.dataset(:dyestuff);
@test keys(dyestuff) == [:batch, :yield]
@test length(dyestuff.batch) == 30
@test_throws ArgumentError MixedModels.dataset(:foo)
end

Expand Down

0 comments on commit b7402e2

Please sign in to comment.