Skip to content

Commit

Permalink
Merge pull request #4 from davidanthoff/new-excelreaders
Browse files Browse the repository at this point in the history
Incorporate table read from ExcelReaders
  • Loading branch information
davidanthoff authored Apr 30, 2018
2 parents d4c74c0 + 0afa383 commit bd0089e
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 17 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# ExcelFiles.jl v0.3.0
* Incorporate all table functionality from ExcelReaders.jl.
* Drop dependency on DataTables.jl and DataFrames.jl.

# ExcelFiles.jl v0.2.0
* Move to TableTraits.jl

Expand Down
10 changes: 5 additions & 5 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
julia 0.6
TableTraits 0.0.1
ExcelReaders 0.7.0
IteratorInterfaceExtensions 0.0.2
TableTraits 0.0.3
TableTraitsUtils
ExcelReaders 0.9.0
IterableTables 0.5.0
DataValues 0.1.0
DataTables 0.0.3
DataValues 0.3.3
FileIO 0.4.0
DataFrames 0.9.0
90 changes: 84 additions & 6 deletions src/ExcelFiles.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
module ExcelFiles


using ExcelReaders, TableTraits, IterableTables, DataValues, DataFrames
using ExcelReaders, IteratorInterfaceExtensions, TableTraits, DataValues,
TableTraitsUtils
import FileIO
import IterableTables

struct ExcelFile
filename::String
Expand All @@ -14,15 +16,91 @@ function load(f::FileIO.File{FileIO.format"Excel"}, range; keywords...)
return ExcelFile(f.filename, range, keywords)
end

# Trait declarations: every `ExcelFile` reports itself as iterable and as an
# iterable table, so table consumers can request an iterator from it.
TableTraits.isiterable(x::ExcelFile) = true
IteratorInterfaceExtensions.isiterable(x::ExcelFile) = true
TableTraits.isiterabletable(x::ExcelFile) = true

function TableTraits.getiterator(file::ExcelFile)
df = contains(file.range, "!") ? readxl(DataFrame, file.filename, file.range; file.keywords...) : readxlsheet(DataFrame, file.filename, file.range)
"""
    gennames(n::Integer)

Return a `Vector{Symbol}` with the `n` generated column names `:x1`, `:x2`, …, `:xn`.

An empty `Vector{Symbol}` is returned when `n` is not positive.
"""
function gennames(n::Integer)
    # A typed comprehension keeps the element type `Symbol` even for an empty
    # result, and `Symbol("x", i)` builds the name without `@sprintf`.
    return Symbol[Symbol("x", i) for i in 1:n]
end

"""
    _readxl(file, sheetname, startrow, startcol, endrow, endcol; header=true, colnames=Symbol[])

Read the cell range of sheet `sheetname` with `ExcelReaders.readxl_internal` and
return the tuple `(columns, colnames)`, where `columns` holds one array per
column of the range and `colnames` holds the matching column names.

# Keywords
- `header::Bool=true`: take the column names from the first row of the range and
  exclude that row from the data.
- `colnames::Vector{Symbol}=Symbol[]`: explicit column names. When non-empty they
  are used as given and must have one entry per column of the range.

Calls `error` when a non-empty `colnames` does not match the number of columns.
"""
function _readxl(file::ExcelReaders.ExcelFile, sheetname::AbstractString, startrow::Int, startcol::Int, endrow::Int, endcol::Int; header::Bool=true, colnames::Vector{Symbol}=Symbol[])
    data = ExcelReaders.readxl_internal(file, sheetname, startrow, startcol, endrow, endcol)

    ncol = size(data, 2)

    if length(colnames)==0
        if header
            headervec = data[1, :]
            # Header cells that are NA receive generated names (x1, x2, ...),
            # numbered by their position among the NA cells only.
            NAcol = map(i->isa(i, DataValues.DataValue) && DataValues.isna(i), headervec)
            headervec[NAcol] = gennames(countnz(NAcol))

            # This somewhat complicated conditional makes sure that column names
            # that are integer numbers end up without an extra ".0" as their name
            colnames = [isa(i, AbstractFloat) ? ( modf(i)[1]==0.0 ? Symbol(Int(i)) : Symbol(string(i)) ) : Symbol(i) for i in vec(headervec)]
        else
            colnames = gennames(ncol)
        end
    elseif length(colnames)!=ncol
        error("Length of colnames must equal number of columns in selected range")
    end

    columns = Array{Any}(ncol)

    for i=1:ncol
        # The first row is dropped whenever `header` is set, including the case
        # where explicit `colnames` were supplied.
        vals = header ? data[2:end,i] : data[:,i]

        # Promote over the types of all values in this column to find the
        # common element type used for the column array.
        type_of_el = length(vals)>0 ? typeof(vals[1]) : Any
        for val=vals
            type_of_el = promote_type(type_of_el, typeof(val))
        end

        if type_of_el <: DataValue
            columns[i] = convert(DataValueArray, vals)

            # TODO Check whether this hack is correct
            # Unwrap elements that ended up as a DataValue holding another
            # DataValue, so each slot carries a single level of wrapping.
            for (j,v) in enumerate(columns[i])
                if v isa DataValue && !DataValues.isna(v) && v[] isa DataValue
                    columns[i][j] = v[]
                end
            end
        else
            columns[i] = convert(Array{type_of_el}, vals)
        end
    end

    return columns, colnames
end

"""
    IteratorInterfaceExtensions.getiterator(file::ExcelFile)

Open the workbook `file.filename`, read the table described by `file.range`, and
return the iterator built by `create_tableiterator` from the column data and
column names.

When `file.range` contains `!` it is parsed by
`ExcelReaders.convert_ref_to_sheet_row_col` as a sheet-plus-cell-range reference.
Otherwise it is used as a sheet name and the cell range is obtained from
`ExcelReaders.convert_args_to_row_col`.
"""
function IteratorInterfaceExtensions.getiterator(file::ExcelFile)
    # `_readxl` accepts exactly these keywords; anything else is a range option.
    table_keys = (:header, :colnames)

    column_data, col_names = if contains(file.range, "!")
        excelfile = openxl(file.filename)

        sheetname, startrow, startcol, endrow, endcol = ExcelReaders.convert_ref_to_sheet_row_col(file.range)

        _readxl(excelfile, sheetname, startrow, startcol, endrow, endcol; file.keywords...)
    else
        excelfile = openxl(file.filename)
        sheet = excelfile.workbook[:sheet_by_name](file.range)

        # Split the user keywords so each callee only receives the ones it
        # understands; forwarding all of them to both calls makes `_readxl`
        # reject any range option.
        # NOTE(review): assumes `convert_args_to_row_col` takes only
        # range-selection keywords - confirm against ExcelReaders.
        table_kw = [kw for kw in file.keywords if kw[1] in table_keys]
        range_kw = [kw for kw in file.keywords if !(kw[1] in table_keys)]

        startrow, startcol, endrow, endcol = ExcelReaders.convert_args_to_row_col(sheet; range_kw...)

        _readxl(excelfile, file.range, startrow, startcol, endrow, endcol; table_kw...)
    end

    return create_tableiterator(column_data, col_names)
end

return it
"""
    Base.collect(file::ExcelFile)

Collect every element produced by `getiterator(file)` and return the result.
"""
Base.collect(file::ExcelFile) = collect(getiterator(file))

end # module
89 changes: 83 additions & 6 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,94 @@
using FileIO
using ExcelFiles
using TableTraits
using IterableTables
using DataFrames
using TableTraitsUtils
using Base.Test

# End-to-end tests that read ExcelReaders' `TestData.xlsx` through `load`.
@testset "ExcelFiles" begin

# Load the whole sheet and materialise it as a DataFrame.
df = load(joinpath(Pkg.dir("ExcelReaders"), "test", "TestData.xlsx"), "Sheet1") |> DataFrame
filename = normpath(Pkg.dir("ExcelReaders"),"test", "TestData.xlsx")

@test size(df) == (4,13)

efile = load(joinpath(Pkg.dir("ExcelReaders"), "test", "TestData.xlsx"), "Sheet1")
efile = load(filename, "Sheet1")

# The loaded object must advertise itself as iterable.
@test isiterable(efile) == true

# The explicit range and the whole-sheet load are expected to yield the same
# 13 columns of 4 rows, so both results run through identical checks.
full_dfs = [create_columns_from_iterabletable(load(filename, "Sheet1!C3:O7")), create_columns_from_iterabletable(load(filename, "Sheet1"))]
for (df, names) in full_dfs
@test length(df) == 13
@test length(df[1]) == 4

@test df[1] == [1., 1.5, 2., 2.5]
@test df[2] == ["A", "BB", "CCC", "DDDD"]
@test df[3] == [true, false, false, true]
@test df[4] == [2, "EEEEE", false, 1.5]
@test df[5] == [9., "III", DataValues.NA, true]
@test df[6] == [3., DataValues.NA, 3.5, 4]
@test df[7] == ["FF", DataValues.NA, "GGG", "HHHH"]
@test df[8] == [DataValues.NA, true, DataValues.NA, false]
@test df[9] == [Date(2015,3,3), DateTime(2015,2,4,10,14), Date(1988,4,9), Dates.Time(15,2,0)]
@test df[10] == [Date(1965,4,3), DateTime(1950,8,9,18,40), Dates.Time(19,0,0), DataValues.NA]
@test eltype(df[11]) == ExcelReaders.ExcelErrorCell
@test df[12][1] isa ExcelReaders.ExcelErrorCell
@test df[12][2] isa ExcelReaders.ExcelErrorCell
@test df[12][3] isa ExcelReaders.ExcelErrorCell
@test df[12][4] == DataValues.NA
@test df[13] == [DataValues.NA, 3.4, "HKEJW", DataValues.NA]
end

# header=false on a range starting at row 4: the column names are expected to
# be the generated :x1 ... :x13 and the data the same 4 rows as above.
df, names = create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=false))
@test names == [:x1,:x2,:x3,:x4,:x5,:x6,:x7,:x8,:x9,:x10,:x11,:x12,:x13]
@test length(df[1]) == 4
@test length(df) == 13
@test df[1] == [1., 1.5, 2., 2.5]
@test df[2] == ["A", "BB", "CCC", "DDDD"]
@test df[3] == [true, false, false, true]
@test df[4] == [2, "EEEEE", false, 1.5]
@test df[5] == [9., "III", DataValues.NA, true]
@test df[6] == [3, DataValues.NA, 3.5, 4]
@test df[7] == ["FF", DataValues.NA, "GGG", "HHHH"]
@test df[8] == [DataValues.NA, true, DataValues.NA, false]
@test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15,2,0)]
@test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19,0,0), DataValues.NA]
@test isa(df[11][1], ExcelReaders.ExcelErrorCell)
@test isa(df[11][2], ExcelReaders.ExcelErrorCell)
@test isa(df[11][3], ExcelReaders.ExcelErrorCell)
@test isa(df[11][4], ExcelReaders.ExcelErrorCell)
@test isa(df[12][1], ExcelReaders.ExcelErrorCell)
@test isa(df[12][2], ExcelReaders.ExcelErrorCell)
@test isa(df[12][3], ExcelReaders.ExcelErrorCell)
@test DataValues.isna(df[12][4])
@test df[13] == [DataValues.NA, 3.4, "HKEJW", DataValues.NA]

# Explicit `colnames` with header=false: the supplied names are expected to be
# returned unchanged alongside the same data.
good_colnames = [:c1, :c2, :c3, :c4, :c5, :c6, :c7, :c8, :c9, :c10, :c11, :c12, :c13]
df, names = create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=false, colnames=good_colnames))
@test names == good_colnames
@test length(df[1]) == 4
@test length(df) == 13
@test df[1] == [1., 1.5, 2., 2.5]
@test df[2] == ["A", "BB", "CCC", "DDDD"]
@test df[3] == [true, false, false, true]
@test df[4] == [2, "EEEEE", false, 1.5]
@test df[5] == [9., "III", DataValues.NA, true]
@test df[6] == [3, DataValues.NA, 3.5, 4]
@test df[7] == ["FF", DataValues.NA, "GGG", "HHHH"]
@test df[8] == [DataValues.NA, true, DataValues.NA, false]
@test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15,2,0)]
@test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19,0,0), DataValues.NA]
@test isa(df[11][1], ExcelReaders.ExcelErrorCell)
@test isa(df[11][2], ExcelReaders.ExcelErrorCell)
@test isa(df[11][3], ExcelReaders.ExcelErrorCell)
@test isa(df[11][4], ExcelReaders.ExcelErrorCell)
@test isa(df[12][1], ExcelReaders.ExcelErrorCell)
@test isa(df[12][2], ExcelReaders.ExcelErrorCell)
@test isa(df[12][3], ExcelReaders.ExcelErrorCell)
@test DataValues.isna(df[12][4])
@test df[13] == [DataValues.NA, 3.4, "HKEJW", DataValues.NA]

# Too few colnames
# Four names for a 13-column range must raise an ErrorException.
@test_throws ErrorException create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=true, colnames=[:c1, :c2, :c3, :c4]))

# Test for constructing DataFrame with empty header cell
# The middle header cell is expected to come back as the generated name :x1.
data, names = create_columns_from_iterabletable(load(filename, "Sheet2!C5:E7"))
@test names == [:Col1, :x1, :Col3]

end

0 comments on commit bd0089e

Please sign in to comment.