diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index 085a685059..6b1d7f050b 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -170,12 +170,13 @@ wrap(s::Any) = DataFrame(x1 = s) Combine a GroupApplied object (rudimentary) ```julia -combine(ga::GroupApplied) +combine(ga::GroupApplied; append_keys::Bool=false) ``` ### Arguments * `ga` : a GroupApplied +* `append_keys` : whether to include the grouping columns in the result; when `false` (the default), only the combined values are returned. ### Returns @@ -192,7 +193,7 @@ combine(map(d -> mean(skipmissing(d[:c])), gd)) ``` """ -function combine(ga::GroupApplied) +function combine(ga::GroupApplied; append_keys::Bool=false) gd, vals = ga.gd, ga.vals valscat = _vcat(vals) idx = Vector{Int}(undef, size(valscat, 1)) @@ -202,7 +203,7 @@ function combine(ga::GroupApplied) idx[j .+ (1:n)] .= gd.idx[start] j += n end - hcat!(gd.parent[idx, gd.cols], valscat) + append_keys ? hcat!(gd.parent[idx, gd.cols], valscat) : valscat end @@ -250,8 +251,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d` based on columns `col` ```julia -by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) -by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) +by(d::AbstractDataFrame, cols, f::Function; sort::Bool=false, append_keys::Bool=false) +by(f::Function, d::AbstractDataFrame, cols; sort::Bool=false, append_keys::Bool=false) ``` ### Arguments @@ -261,6 +262,7 @@ by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) * `f` : a function to be applied to groups; expects each argument to be an AbstractDataFrame * `sort`: sort row groups (no sorting by default) +* `append_keys`: include the grouping columns `cols` in the result (not included by default) `f` can return a value, a vector, or a DataFrame. For a value or vector, these are merged into a column along with the `cols` keys. 
For @@ -293,10 +295,10 @@ end ``` """ -by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) = - combine(map(f, groupby(d, cols, sort = sort))) -by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) = - by(d, cols, f, sort = sort) +by(d::AbstractDataFrame, cols, f::Function; sort::Bool=false, append_keys::Bool=false) = + combine(map(f, groupby(d, cols, sort=sort)), append_keys=append_keys) +by(f::Function, d::AbstractDataFrame, cols; sort::Bool=false, append_keys::Bool=false) = + by(d, cols, f, sort=sort, append_keys=append_keys) # # Aggregate convenience functions @@ -349,7 +351,7 @@ end aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort) function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function headers = _makeheaders(fs, setdiff(_names(gd), _names(gd.parent[gd.cols]))) - res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)) + res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd), append_keys=true) sort && sort!(res, headers) res end diff --git a/test/grouping.jl b/test/grouping.jl index be2643ad82..900c9b68c8 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -92,30 +92,30 @@ module TestGrouping sdf = unique(df[cols]) # by() without groups sorting - bdf = by(df, cols, f) + bdf = by(df, cols, f, append_keys=true) @test bdf[cols] == sdf # by() with groups sorting - sbdf = by(df, cols, f, sort=true) + sbdf = by(df, cols, f, sort=true, append_keys=true) @test sbdf[cols] == sort(sdf) - byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b]))) + byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b])), append_keys=true) # groupby() without groups sorting gd = groupby(df, cols) ga = map(f, gd) - @test bdf == combine(ga) + @test bdf == combine(ga, append_keys=true) # groupby() with groups sorting gd = groupby(df, cols, sort=true) ga = map(f, gd) - @test sbdf == combine(ga) + @test sbdf == combine(ga, append_keys=true) g(df) = 
DataFrame(cmax1 = [c + 1 for c in df[:cmax]]) h(df) = g(f(df)) - @test combine(map(h, gd)) == combine(map(g, ga)) + @test combine(map(h, gd), append_keys=true) == combine(map(g, ga), append_keys=true) # testing pool overflow df2 = DataFrame(v1 = categorical(collect(1:1000)), v2 = categorical(fill(1, 1000))) @@ -142,7 +142,7 @@ module TestGrouping df = DataFrame(v1=x, v2=x) groupby(df, [:v1, :v2]) - df2 = by(e->1, DataFrame(x=Int64[]), :x) + df2 = by(e->1, DataFrame(x=Int64[]), :x, append_keys=true) @test size(df2) == (0, 1) @test sum(df2[:x]) == 0