Merge pull request #4 from davidanthoff/new-excelreaders

Incorporate table read from ExcelReaders
queryverse · Apr 30, 2018 · bd0089e · bd0089e
2 parents d4c74c0 + 0afa383
commit bd0089e
Show file tree

Hide file tree

Showing 4 changed files with 176 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# ExcelFiles.jl v0.3.0
+* Incorporate all table functionality from ExcelReaders.jl.
+* Drop dependency on DataTables.jl and DataFrames.jl.
+
 # ExcelFiles.jl v0.2.0
 * Move to TableTraits.jl
 

diff --git a/REQUIRE b/REQUIRE
@@ -1,8 +1,8 @@
 julia 0.6
-TableTraits 0.0.1
-ExcelReaders 0.7.0
+IteratorInterfaceExtensions 0.0.2
+TableTraits 0.0.3
+TableTraitsUtils
+ExcelReaders 0.9.0
 IterableTables 0.5.0
-DataValues 0.1.0
-DataTables 0.0.3
+DataValues 0.3.3
 FileIO 0.4.0
-DataFrames 0.9.0
diff --git a/src/ExcelFiles.jl b/src/ExcelFiles.jl
@@ -1,8 +1,10 @@
 module ExcelFiles
 
 
-using ExcelReaders, TableTraits, IterableTables, DataValues, DataFrames
+using ExcelReaders, IteratorInterfaceExtensions, TableTraits, DataValues,
+    TableTraitsUtils
 import FileIO
+import IterableTables
 
 struct ExcelFile
     filename::String
@@ -14,15 +16,91 @@ function load(f::FileIO.File{FileIO.format"Excel"}, range; keywords...)
     return ExcelFile(f.filename, range, keywords)
 end
 
-TableTraits.isiterable(x::ExcelFile) = true
+IteratorInterfaceExtensions.isiterable(x::ExcelFile) = true  # declare every ExcelFile iterable; the next line also declares it an iterable table
 TableTraits.isiterabletable(x::ExcelFile) = true
 
-function TableTraits.getiterator(file::ExcelFile)
-    df = contains(file.range, "!") ? readxl(DataFrame, file.filename, file.range; file.keywords...) : readxlsheet(DataFrame, file.filename, file.range)
+# Generate `n` placeholder column names `:x1, :x2, ..., :xn`, used for columns
+# that have no usable header cell. Returns a `Vector{Symbol}`; `n == 0` gives
+# an empty vector.
+function gennames(n::Integer)
+    # `Symbol("x", i)` concatenates the printed arguments, so neither
+    # `@sprintf` formatting nor a pre-sized, uninitialised buffer is needed.
+    return Symbol[Symbol("x", i) for i in 1:n]
+end
+
+# Read the cell range `startrow:endrow` x `startcol:endcol` of sheet `sheetname`
+# from an opened workbook and split it into one array per column.
+#
+# Keyword arguments:
+# * `header`: when `true`, the first row of the range is not treated as data;
+#   it supplies the column names unless `colnames` is given.
+# * `colnames`: explicit column names; needs one entry per column of the range.
+#   An empty vector (the default) means "derive the names".
+#
+# Returns `(columns, colnames)`: `columns[i]` holds the values of column `i`
+# (a `DataValueArray` when the promoted element type is a `DataValue`, an
+# `Array` of the promoted element type otherwise) and `colnames` holds the
+# column-name symbols.
+#
+# Calls `error` when `colnames` has the wrong length, or when the names have to
+# come from a header row but the range has no rows.
+function _readxl(file::ExcelReaders.ExcelFile, sheetname::AbstractString, startrow::Int, startcol::Int, endrow::Int, endcol::Int; header::Bool=true, colnames::Vector{Symbol}=Symbol[])
+    data = ExcelReaders.readxl_internal(file, sheetname, startrow, startcol, endrow, endcol)
+
+    nrow, ncol = size(data)
+
+    if length(colnames)==0
+        if header
+            # Without this guard a range without rows would surface as an
+            # opaque BoundsError from `data[1, :]` below.
+            if nrow == 0
+                error("Cannot read column names from the header row of an empty range")
+            end
+
+            headervec = data[1, :]
+
+            # Header cells that are NA get generated names (x1, x2, ...),
+            # numbered in the order in which the empty cells appear.
+            NAcol = map(i->isa(i, DataValues.DataValue) && DataValues.isna(i), headervec)
+            headervec[NAcol] = gennames(countnz(NAcol))
+
+            # This somewhat complicated conditional makes sure that column names
+            # that are integer numbers end up without an extra ".0" as their name
+            colnames = [isa(i, AbstractFloat) ? ( modf(i)[1]==0.0 ? Symbol(Int(i)) : Symbol(string(i)) ) : Symbol(i) for i in vec(headervec)]
+        else
+            colnames = gennames(ncol)
+        end
+    elseif length(colnames)!=ncol
+        error("Length of colnames must equal number of columns in selected range")
+    end
+
+    columns = Array{Any}(ncol)
+
+    for i=1:ncol
+        # Skip the first row whenever it is a header, even if the names were
+        # supplied explicitly through `colnames`.
+        if header
+            vals = data[2:end,i]
+        else
+            vals = data[:,i]
+        end
+
+        # Find the common element type of the column by promoting the
+        # types of all its values; an empty column falls back to Any
+        type_of_el = length(vals)>0 ? typeof(vals[1]) : Any
+        for val=vals
+            type_of_el = promote_type(type_of_el, typeof(val))
+        end
 
-    it = getiterator(df)
+        if type_of_el <: DataValue
+            columns[i] = convert(DataValueArray, vals)
+
+            # TODO Check whether this hack is correct
+            # If an element ended up as a DataValue wrapped in another
+            # DataValue, store the inner value instead.
+            for (j,v) in enumerate(columns[i])
+                if v isa DataValue && !DataValues.isna(v) && v[] isa DataValue
+                    columns[i][j] = v[]
+                end
+            end
+        else
+            columns[i] = convert(Array{type_of_el}, vals)
+        end
+    end
+
+    return columns, colnames
+end
+
+# Build the table iterator for an `ExcelFile`.
+#
+# `file.range` is interpreted in one of two ways:
+# * it contains "!": it is parsed as a sheet-plus-cell-range reference, and
+#   `file.keywords` are passed to `_readxl` unchanged;
+# * otherwise it is taken as a sheet name, the cell range is determined by
+#   `ExcelReaders.convert_args_to_row_col`, and the keywords are split between
+#   that call and `_readxl`.
+#
+# Returns whatever `create_tableiterator` builds from the column arrays and
+# column names produced by `_readxl`.
+function IteratorInterfaceExtensions.getiterator(file::ExcelFile)
+    # Both addressing modes need the opened workbook.
+    excelfile = openxl(file.filename)
+
+    column_data, col_names = if contains(file.range, "!")
+        sheetname, startrow, startcol, endrow, endcol = ExcelReaders.convert_ref_to_sheet_row_col(file.range)
+
+        _readxl(excelfile, sheetname, startrow, startcol, endrow, endcol; file.keywords...)
+    else
+        sheet = excelfile.workbook[:sheet_by_name](file.range)
+
+        # `_readxl` accepts only `header` and `colnames`, so range-selection
+        # keywords must not reach it (they would raise an error there).
+        # Conversely, `header`/`colnames` are kept away from the range
+        # detection. NOTE(review): this assumes convert_args_to_row_col does
+        # not accept them -- confirm against ExcelReaders.
+        table_keywords = filter(kw -> kw[1] in (:header, :colnames), file.keywords)
+        range_keywords = filter(kw -> !(kw[1] in (:header, :colnames)), file.keywords)
+
+        startrow, startcol, endrow, endcol = ExcelReaders.convert_args_to_row_col(sheet; range_keywords...)
+
+        _readxl(excelfile, file.range, startrow, startcol, endrow, endcol; table_keywords...)
+    end
+
+    return create_tableiterator(column_data, col_names)
+end
 
-    return it
+function Base.collect(file::ExcelFile)  # materialise the table: collect the iterator that getiterator(file) returns
+    return collect(getiterator(file))
 end
 
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,17 +1,94 @@
 using FileIO
+using ExcelFiles
 using TableTraits
-using IterableTables
-using DataFrames
+using TableTraitsUtils
 using Base.Test
 
 @testset "ExcelFiles" begin
 
-df = load(joinpath(Pkg.dir("ExcelReaders"), "test", "TestData.xlsx"), "Sheet1") |> DataFrame
+filename = normpath(Pkg.dir("ExcelReaders"),"test", "TestData.xlsx")
 
-@test size(df) == (4,13)
-
-efile = load(joinpath(Pkg.dir("ExcelReaders"), "test", "TestData.xlsx"), "Sheet1")
+efile = load(filename, "Sheet1")
 
 @test isiterable(efile) == true
 
+full_dfs = [create_columns_from_iterabletable(load(filename, "Sheet1!C3:O7")), create_columns_from_iterabletable(load(filename, "Sheet1"))]
+for (df, names) in full_dfs
+    @test length(df) == 13
+    @test length(df[1]) == 4
+
+    @test df[1] == [1., 1.5, 2., 2.5]
+    @test df[2] == ["A", "BB", "CCC", "DDDD"]
+    @test df[3] == [true, false, false, true]
+    @test df[4] == [2, "EEEEE", false, 1.5]
+    @test df[5] == [9., "III", DataValues.NA, true]
+    @test df[6] == [3., DataValues.NA, 3.5, 4]
+    @test df[7] == ["FF", DataValues.NA, "GGG", "HHHH"]
+    @test df[8] == [DataValues.NA, true, DataValues.NA, false]
+    @test df[9] == [Date(2015,3,3), DateTime(2015,2,4,10,14), Date(1988,4,9), Dates.Time(15,2,0)]
+    @test df[10] == [Date(1965,4,3), DateTime(1950,8,9,18,40), Dates.Time(19,0,0), DataValues.NA]
+    @test eltype(df[11]) == ExcelReaders.ExcelErrorCell
+    @test df[12][1] isa ExcelReaders.ExcelErrorCell
+    @test df[12][2] isa ExcelReaders.ExcelErrorCell
+    @test df[12][3] isa ExcelReaders.ExcelErrorCell
+    @test df[12][4] == DataValues.NA
+    @test df[13] == [DataValues.NA, 3.4, "HKEJW", DataValues.NA]
+end
+
+df, names = create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=false))
+@test names == [:x1,:x2,:x3,:x4,:x5,:x6,:x7,:x8,:x9,:x10,:x11,:x12,:x13]
+@test length(df[1]) == 4
+@test length(df) == 13
+@test df[1] == [1., 1.5, 2., 2.5]
+@test df[2] == ["A", "BB", "CCC", "DDDD"]
+@test df[3] == [true, false, false, true]
+@test df[4] == [2, "EEEEE", false, 1.5]
+@test df[5] == [9., "III", DataValues.NA, true]
+@test df[6] == [3, DataValues.NA, 3.5, 4]
+@test df[7] == ["FF", DataValues.NA, "GGG", "HHHH"]
+@test df[8] == [DataValues.NA, true, DataValues.NA, false]
+@test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15,2,0)]
+@test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19,0,0), DataValues.NA]
+@test isa(df[11][1], ExcelReaders.ExcelErrorCell)
+@test isa(df[11][2], ExcelReaders.ExcelErrorCell)
+@test isa(df[11][3], ExcelReaders.ExcelErrorCell)
+@test isa(df[11][4], ExcelReaders.ExcelErrorCell)
+@test isa(df[12][1], ExcelReaders.ExcelErrorCell)
+@test isa(df[12][2], ExcelReaders.ExcelErrorCell)
+@test isa(df[12][3], ExcelReaders.ExcelErrorCell)
+@test DataValues.isna(df[12][4])
+@test df[13] == [DataValues.NA, 3.4, "HKEJW", DataValues.NA]
+
+good_colnames = [:c1, :c2, :c3, :c4, :c5, :c6, :c7, :c8, :c9, :c10, :c11, :c12, :c13]
+df, names = create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=false, colnames=good_colnames))
+@test names == good_colnames
+@test length(df[1]) == 4
+@test length(df) == 13
+@test df[1] == [1., 1.5, 2., 2.5]
+@test df[2] == ["A", "BB", "CCC", "DDDD"]
+@test df[3] == [true, false, false, true]
+@test df[4] == [2, "EEEEE", false, 1.5]
+@test df[5] == [9., "III", DataValues.NA, true]
+@test df[6] == [3, DataValues.NA, 3.5, 4]
+@test df[7] == ["FF", DataValues.NA, "GGG", "HHHH"]
+@test df[8] == [DataValues.NA, true, DataValues.NA, false]
+@test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15,2,0)]
+@test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19,0,0), DataValues.NA]
+@test isa(df[11][1], ExcelReaders.ExcelErrorCell)
+@test isa(df[11][2], ExcelReaders.ExcelErrorCell)
+@test isa(df[11][3], ExcelReaders.ExcelErrorCell)
+@test isa(df[11][4], ExcelReaders.ExcelErrorCell)
+@test isa(df[12][1], ExcelReaders.ExcelErrorCell)
+@test isa(df[12][2], ExcelReaders.ExcelErrorCell)
+@test isa(df[12][3], ExcelReaders.ExcelErrorCell)
+@test DataValues.isna(df[12][4])
+@test df[13] == [DataValues.NA, 3.4, "HKEJW", DataValues.NA]
+
+# Too few colnames
+@test_throws ErrorException create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=true, colnames=[:c1, :c2, :c3, :c4]))
+
+# Test that an empty header cell gets a generated column name
+data, names = create_columns_from_iterabletable(load(filename, "Sheet2!C5:E7"))
+@test names == [:Col1, :x1, :Col3]
+
 end