diff --git a/docs/make.jl b/docs/make.jl index 43c18440d4..991ff07528 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -19,16 +19,11 @@ makedocs( "Reshaping" => "man/reshaping_and_pivoting.md", "Sorting" => "man/sorting.md", "Categorical Data" => "man/categorical.md", - "Querying frameworks" => "man/querying_frameworks.md", + "Querying frameworks" => "man/querying_frameworks.md" ], "API" => Any[ - "Main types" => "lib/maintypes.md", - "Utilities" => "lib/utilities.md", - "Data manipulation" => "lib/manipulation.md", - ], - "About" => Any[ - "Release Notes" => "NEWS.md", - "License" => "LICENSE.md", + "Types" => "lib/types.md", + "Functions" => "lib/functions.md" ] ] ) diff --git a/docs/src/LICENSE.md b/docs/src/LICENSE.md deleted file mode 100644 index 2c7be2a321..0000000000 --- a/docs/src/LICENSE.md +++ /dev/null @@ -1,23 +0,0 @@ -DataFrames.jl is licensed under the MIT License: - -> Copyright (c) 2012-2015: Harlan Harris, EPRI (Tom Short's code), Chris DuBois, -> John Myles White, and other contributors. -> -> Permission is hereby granted, free of charge, to any person obtaining -> a copy of this software and associated documentation files (the -> "Software"), to deal in the Software without restriction, including -> without limitation the rights to use, copy, modify, merge, publish, -> distribute, sublicense, and/or sell copies of the Software, and to -> permit persons to whom the Software is furnished to do so, subject to -> the following conditions: -> -> The above copyright notice and this permission notice shall be -> included in all copies or substantial portions of the Software. -> -> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -> NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -> LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -> OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -> WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/docs/src/NEWS.md b/docs/src/NEWS.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/src/index.md b/docs/src/index.md index b6fe7e1dc9..27de73e917 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,21 +1,40 @@ -# DataFrames Documentation Outline +# DataFrames.jl + +Welcome to the DataFrames documentation! This resource aims to teach you everything you need +to know to get up and running with tabular data manipulation using the DataFrames.jl package +and the Julia language. If there is something you expect DataFrames to be capable of, but +cannot figure out how to do, please reach out with questions in Domains/Data on +[Discourse](https://discourse.julialang.org/new-topic?title=[DataFrames%20Question]:%20&body=%23%20Question:%0A%0A%23%20Dataset%20(if%20applicable):%0A%0A%23%20Minimal%20Working%20Example%20(if%20applicable):%0A&category=Domains/Data&tags=question). +Please report bugs by +[opening an issue](https://github.com/JuliaData/DataFrames.jl/issues/new). You can follow +the [**source**]() links throughout the documentation to jump right to the +source files on GitHub to make pull requests for improving the documentation and function +capabilities. Please review +[DataFrames contributing guidelines](https://github.com/JuliaData/DataFrames.jl/blob/master/CONTRIBUTING.md) +before submitting your first PR! Information on specific versions can be found on the [Release page](https://github.com/JuliaData/DataFrames.jl/releases). 
## Package Manual ```@contents -Pages = ["man/getting_started.md", "man/joins.md", "man/split_apply_combine.md", "man/reshaping_and_pivoting.md", "man/sorting.md", "man/categorical.md", "man/querying_frameworks.md"] +Pages = ["man/getting_started.md", + "man/joins.md", + "man/split_apply_combine.md", + "man/reshaping_and_pivoting.md", + "man/sorting.md", + "man/categorical.md", + "man/querying_frameworks.md"] Depth = 2 ``` ## API ```@contents -Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"] +Pages = ["lib/types.md", "lib/functions.md"] Depth = 2 ``` -## Documentation Index +## Index ```@index -Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"] +Pages = ["lib/types.md", "lib/functions.md"] ``` diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md new file mode 100644 index 0000000000..6e2d7a4d3e --- /dev/null +++ b/docs/src/lib/functions.md @@ -0,0 +1,54 @@ +```@meta +CurrentModule = DataFrames +``` + +# Functions + +```@index +Pages = ["functions.md"] +``` + +## Grouping, Joining, and Split-Apply-Combine + +```@docs +aggregate +by +colwise +groupby +join +melt +stack +unstack +stackdf +meltdf +``` + +## Basics + +```@docs +categorical! +combine +completecases +deleterows! +describe +dropnull +dropnull! +eachcol +eachrow +eltypes +head +names +names! +nonunique +nullable! +order +rename! +rename +show +showcols +size +sort +sort! +tail +unique! 
+``` diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md deleted file mode 100644 index 1f9f578d25..0000000000 --- a/docs/src/lib/manipulation.md +++ /dev/null @@ -1,25 +0,0 @@ -```@meta -CurrentModule = DataFrames -``` - -# Data Manipulation - -```@index -Pages = ["manipulation.md"] -``` - -## Joins - -```@docs -join -``` - -## Reshaping - -```@docs -melt -stack -unstack -stackdf -meltdf -``` diff --git a/docs/src/lib/maintypes.md b/docs/src/lib/types.md similarity index 60% rename from docs/src/lib/maintypes.md rename to docs/src/lib/types.md index ccc62d530c..a9e1ac3760 100644 --- a/docs/src/lib/maintypes.md +++ b/docs/src/lib/types.md @@ -3,14 +3,17 @@ CurrentModule = DataFrames ``` -# Main Types +# Types ```@index -Pages = ["maintypes.md"] +Pages = ["types.md"] ``` ```@docs AbstractDataFrame DataFrame +DataFrameRow +GroupApplied +GroupedDataFrame SubDataFrame ``` diff --git a/docs/src/lib/utilities.md b/docs/src/lib/utilities.md deleted file mode 100644 index d0439fab90..0000000000 --- a/docs/src/lib/utilities.md +++ /dev/null @@ -1,26 +0,0 @@ -```@meta -CurrentModule = DataFrames -``` - -# Utilities - -```@index -Pages = ["utilities.md"] -``` - -```@docs -eltypes -head -completecases -describe -dropnull -dropnull! -dump -names! -nonunique -rename -rename! -tail -unique -unique! -``` diff --git a/docs/src/man/categorical.md b/docs/src/man/categorical.md index ac41fa800e..85d9774ca4 100644 --- a/docs/src/man/categorical.md +++ b/docs/src/man/categorical.md @@ -2,52 +2,151 @@ Often, we have to deal with factors that take on a small number of levels: -```julia -v = ["Group A", "Group A", "Group A", - "Group B", "Group B", "Group B"] +```jldoctest categorical +julia> v = ["Group A", "Group A", "Group A", "Group B", "Group B", "Group B"] +6-element Array{String,1}: + "Group A" + "Group A" + "Group A" + "Group B" + "Group B" + "Group B" + ``` The naive encoding used in an `Array` represents every entry of this vector as a full string. 
In contrast, we can represent the data more efficiently by replacing the strings with indices into a small pool of levels. This is what the `CategoricalArray` type does: -```julia -cv = CategoricalArray(["Group A", "Group A", "Group A", - "Group B", "Group B", "Group B"]) +```jldoctest categorical +julia> using CategoricalArrays + +julia> cv = CategoricalArray(v) +6-element CategoricalArrays.CategoricalArray{String,1,UInt32}: + "Group A" + "Group A" + "Group A" + "Group B" + "Group B" + "Group B" + ``` `CategoricalArrays` support missing values via the `Nulls` package. -```julia -using Nulls -cv = CategoricalArray(["Group A", null, "Group A", - "Group B", "Group B", null]) +```jldoctest categorical +julia> using Nulls + +julia> cv = CategoricalArray(["Group A", null, "Group A", + "Group B", "Group B", null]) +6-element CategoricalArrays.CategoricalArray{Union{Nulls.Null, String},1,UInt32}: + "Group A" + null + "Group A" + "Group B" + "Group B" + null ``` In addition to representing repeated data efficiently, the `CategoricalArray` type allows us to determine efficiently the allowed levels of the variable at any time using the `levels` function (note that levels may or may not be actually used in the data): -```julia -levels(cv) +```jldoctest categorical +julia> levels(cv) +2-element Array{String,1}: + "Group A" + "Group B" + ``` The `levels!` function also allows changing the order of appearance of the levels, which can be useful for display purposes or when working with ordered variables. -By default, a `CategoricalArray` is able to represent 232differents levels. 
You can use less memory by calling the `compact` function: +```jldoctest categorical +julia> levels!(cv, ["Group B", "Group A"]); + +julia> levels(cv) +2-element Array{String,1}: + "Group B" + "Group A" + +julia> sort(cv) +6-element CategoricalArrays.CategoricalArray{Union{Nulls.Null, String},1,UInt32}: + "Group B" + "Group B" + "Group A" + "Group A" + null + null -```julia -cv = compact(cv) ``` -Often, you will have factors encoded inside a DataFrame with `Array` columns instead of `CategoricalArray` columns. You can do conversion of a single column using the `categorical` function: +By default, a `CategoricalArray` is able to represent 2^32 different levels. You can use less memory by calling the `compress` function: + +```jldoctest categorical +julia> cv = compress(cv) +6-element CategoricalArrays.CategoricalArray{Union{Nulls.Null, String},1,UInt8}: + "Group A" + null + "Group A" + "Group B" + "Group B" + null -```julia -cv = categorical(v) ``` -Or you can edit the columns of a `DataFrame` in-place using the `categorical!` function: +Often, you will have factors encoded inside a DataFrame with `Array` columns instead of +`CategoricalArray` columns. You can convert one or more columns of the DataFrame using the +`categorical!` function, which modifies the input DataFrame in-place. 
+ +```jldoctest categorical +julia> using DataFrames + +julia> df = DataFrame(A = ["A", "B", "C", "D", "D", "A"], + B = ["X", "X", "X", "Y", "Y", "Y"]) +6×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ A │ X │ +│ 2 │ B │ X │ +│ 3 │ C │ X │ +│ 4 │ D │ Y │ +│ 5 │ D │ Y │ +│ 6 │ A │ Y │ + +julia> eltypes(df) +2-element Array{Type,1}: + String + String + +julia> categorical!(df, :A) # change the column `:A` to be categorical +6×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ A │ X │ +│ 2 │ B │ X │ +│ 3 │ C │ X │ +│ 4 │ D │ Y │ +│ 5 │ D │ Y │ +│ 6 │ A │ Y │ + +julia> eltypes(df) +2-element Array{Type,1}: + CategoricalArrays.CategoricalString{UInt32} + String + +julia> categorical!(df) # change all columns to be categorical +6×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ A │ X │ +│ 2 │ B │ X │ +│ 3 │ C │ X │ +│ 4 │ D │ Y │ +│ 5 │ D │ Y │ +│ 6 │ A │ Y │ + +julia> eltypes(df) +2-element Array{Type,1}: + CategoricalArrays.CategoricalString{UInt32} + CategoricalArrays.CategoricalString{UInt32} -```julia -df = DataFrame(A = [1, 1, 1, 2, 2, 2], - B = ["X", "X", "X", "Y", "Y", "Y"]) -categorical!(df, [:A, :B]) ``` Using categorical arrays is important for working with the [GLM package](https://github.com/JuliaStats/GLM.jl). When fitting regression models, `CategoricalArray` columns in the input are translated into 0/1 indicator columns in the `ModelMatrix` with one column for each of the levels of the `CategoricalArray`. This allows one to analyze categorical data efficiently. diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index 10b0491917..63dc7cad69 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -13,7 +13,7 @@ Throughout the rest of this tutorial, we will assume that you have installed the To get started, let's examine the `Null` type. `Null` is a type implemented by the [Nulls.jl](https://github.com/JuliaData/Nulls.jl) package to represent missing data. 
`null` is an instance of the type `Null` used to represent a missing value. -```julia +```jldoctest nulls julia> using DataFrames julia> null @@ -26,7 +26,7 @@ Nulls.Null The `Null` type lets users create `Vector`s and `DataFrame` columns with missing values. Here we create a vector with a null value and the element-type of the returned vector is `Union{Nulls.Null, Int64}`. -```julia +```jldoctest nulls julia> x = [1, 2, null] 3-element Array{Union{Nulls.Null, Int64},1}: 1 @@ -46,15 +46,15 @@ true `null` values can be excluded when performing operations by using `Nulls.skip`, which returns a memory-efficient iterator. -```julia +```jldoctest nulls julia> Nulls.skip(x) -Base.Generator{Base.Iterators.Filter{Nulls.##4#6{Nulls.Null},Array{Union{Nulls.Null, Int64},1}},Nulls.##3#5}(Nulls.#3, Base.Iterators.Filter{Nulls.##4#6{Nulls.Null},Array{Union{Nulls.Null, Int64},1}}(Nulls.#4, Union{Nulls.Null, Int64}[1, 2, null])) +Base.Generator{Base.Iterators.Filter{Nulls.##4#6,Array{Union{Int64, Nulls.Null},1}},Nulls.##3#5}(Nulls.#3, Base.Iterators.Filter{Nulls.##4#6,Array{Union{Int64, Nulls.Null},1}}(Nulls.#4, Union{Int64, Nulls.Null}[1, 2, null])) ``` The output of `Nulls.skip` can be passed directly into functions as an argument. For example, we can find the `sum` of all non-null values or `collect` the non-null values into a new null-free vector. -```julia +```jldoctest nulls julia> sum(Nulls.skip(x)) 3 @@ -67,7 +67,7 @@ julia> collect(Nulls.skip(x)) `null` elements can be replaced with other values via `Nulls.replace`. -```julia +```jldoctest nulls julia> collect(Nulls.replace(x, 1)) 3-element Array{Int64,1}: 1 @@ -78,7 +78,10 @@ julia> collect(Nulls.replace(x, 1)) The function `Nulls.T` returns the element-type `T` in `Union{T, Null}`. 
-```julia +```jldoctest nulls +julia> eltype(x) +Union{Int64, Nulls.Null} + julia> Nulls.T(eltype(x)) Int64 @@ -86,7 +89,7 @@ Int64 Use `nulls` to generate nullable `Vector`s and `Array`s, using the optional first argument to specify the element-type. -```julia +```jldoctest nulls julia> nulls(1) 1-element Array{Nulls.Null,1}: null @@ -111,61 +114,185 @@ julia> nulls(Int, 1, 3) The `DataFrame` type can be used to represent data tables, each column of which is a vector. You can specify the columns using keyword arguments or pairs: -```julia -df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"]) -df = DataFrame(:A => 1:4, :B => ["M", "F", "F", "M"]) +```jldoctest dataframe +julia> using DataFrames + +julia> DataFrame(A = 1:4, B = ["M", "F", "F", "M"]) +4×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ M │ +│ 2 │ 2 │ F │ +│ 3 │ 3 │ F │ +│ 4 │ 4 │ M │ + ``` It is also possible to construct a `DataFrame` in stages: -```julia -df = DataFrame() -df[:A] = 1:8 -df[:B] = ["M", "F", "F", "M", "F", "M", "M", "F"] -df +```jldoctest dataframe +julia> df = DataFrame() +0×0 DataFrames.DataFrame + + +julia> df[:A] = 1:8 +1:8 + +julia> df[:B] = ["M", "F", "F", "M", "F", "M", "M", "F"] +8-element Array{String,1}: + "M" + "F" + "F" + "M" + "F" + "M" + "M" + "F" + +julia> df +8×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ M │ +│ 2 │ 2 │ F │ +│ 3 │ 3 │ F │ +│ 4 │ 4 │ M │ +│ 5 │ 5 │ F │ +│ 6 │ 6 │ M │ +│ 7 │ 7 │ M │ +│ 8 │ 8 │ F │ + ``` -The `DataFrame` we build in this way has 8 rows and 2 columns. You can check this using `size` function: +The `DataFrame` we build in this way has 8 rows and 2 columns. 
You can check this using the +`size` function: + +```jldoctest dataframe +julia> size(df, 1) == 8 +true + +julia> size(df, 2) == 2 +true + +julia> size(df) == (8, 2) +true -```julia -nrows = size(df, 1) -ncols = size(df, 2) ``` We can also look at small subsets of the data in a couple of different ways: -```julia -head(df) -tail(df) +```jldoctest dataframe +julia> head(df) +6×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ M │ +│ 2 │ 2 │ F │ +│ 3 │ 3 │ F │ +│ 4 │ 4 │ M │ +│ 5 │ 5 │ F │ +│ 6 │ 6 │ M │ + +julia> tail(df) +6×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 3 │ F │ +│ 2 │ 4 │ M │ +│ 3 │ 5 │ F │ +│ 4 │ 6 │ M │ +│ 5 │ 7 │ M │ +│ 6 │ 8 │ F │ + +julia> df[1:3, :] +3×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ M │ +│ 2 │ 2 │ F │ +│ 3 │ 3 │ F │ -df[1:3, :] ``` Having seen what some of the rows look like, we can try to summarize the entire data set using `describe`: -```julia -describe(df) +```jldoctest dataframe +julia> describe(df) +A +Summary Stats: +Mean: 4.500000 +Minimum: 1.000000 +1st Quartile: 2.750000 +Median: 4.500000 +3rd Quartile: 6.250000 +Maximum: 8.000000 +Length: 8 +Type: Int64 + +B +Summary Stats: +Length: 8 +Type: String +Number Unique: 2 + + ``` -To focus our search, we start looking at just the means and medians of specific columns. In the example below, we use numeric indexing to access the columns of the `DataFrame`: +To access individual columns of the dataset, you refer to the column names by their symbol +or by their numerical index. Here we extract the first column, `:A`, and use it to compute +the mean and variance. + +```jldoctest dataframe +julia> mean(df[:A]) == mean(df[1]) == 4.5 +true + +julia> var(df[:A]) == var(df[1]) == 6.0 +true -```julia -mean(Nulls.skip(df[1])) -median(Nulls.skip(df[1])) ``` -We could also have used column names to access individual columns: +If your dataset has missing values, most functions will require you to remove them +beforehand. 
Here we will replace all odd-numbered rows in the first column with missing data +to show how to handle the above example when missing values are present in your dataset. + +```jldoctest dataframe +julia> df[:A] = [isodd(i) ? null : value for (i, value) in enumerate(df[:A])]; + +julia> df +8×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼──────┼───┤ +│ 1 │ null │ M │ +│ 2 │ 2 │ F │ +│ 3 │ null │ F │ +│ 4 │ 4 │ M │ +│ 5 │ null │ F │ +│ 6 │ 6 │ M │ +│ 7 │ null │ M │ +│ 8 │ 8 │ F │ + +julia> mean(Nulls.skip(df[:A])) +5.0 -```julia -mean(Nulls.skip(df[:A])) -median(Nulls.skip(df[:A])) ``` We can also apply a function to each column of a `DataFrame` with the `colwise` function. For example: -```julia -df = DataFrame(A = 1:4, B = randn(4)) -colwise(c->cumsum(Nulls.skip(c)), df) +```jldoctest dataframe +julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0) +4×2 DataFrames.DataFrame +│ Row │ A │ B │ +├─────┼───┼─────┤ +│ 1 │ 1 │ 4.0 │ +│ 2 │ 2 │ 3.0 │ +│ 3 │ 3 │ 2.0 │ +│ 4 │ 4 │ 1.0 │ + +julia> colwise(sum, df) +2-element Array{Real,1}: + 10 + 10.0 + ``` ## Importing and Exporting Data (I/O) @@ -199,10 +326,22 @@ The behavior of CSV functions can be adapted via keyword arguments. For more inf To see more of the functionality for working with `DataFrame` objects, we need a more complex data set to work with. 
We can access Fisher's iris data set using the following functions: -```julia -using CSV -iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")) -head(iris) +```jldoctest csv +julia> using DataFrames, CSV + +julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │ +│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │ +│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │ +│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │ +│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │ + ``` ## Querying DataFrames diff --git a/docs/src/man/joins.md b/docs/src/man/joins.md index 94c7e8452e..aca3848b89 100644 --- a/docs/src/man/joins.md +++ b/docs/src/man/joins.md @@ -2,23 +2,36 @@ We often need to combine two or more data sets together to provide a complete picture of the topic we are studying. For example, suppose that we have the following two data sets: -```julia -names = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"]) -jobs = DataFrame(ID = [20, 40], Job = ["Lawyer", "Doctor"]) +```jldoctest joins +julia> using DataFrames + +julia> names = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"]) +2×2 DataFrames.DataFrame +│ Row │ ID │ Name │ +├─────┼────┼──────────┤ +│ 1 │ 20 │ John Doe │ +│ 2 │ 40 │ Jane Doe │ + +julia> jobs = DataFrame(ID = [20, 40], Job = ["Lawyer", "Doctor"]) +2×2 DataFrames.DataFrame +│ Row │ ID │ Job │ +├─────┼────┼────────┤ +│ 1 │ 20 │ Lawyer │ +│ 2 │ 40 │ Doctor │ + ``` We might want to work with a larger data set that contains both the names and jobs for each ID. 
We can do this using the `join` function: -```julia -full = join(names, jobs, on = :ID) -``` - -Output: +```jldoctest joins +julia> join(names, jobs, on = :ID) +2×3 DataFrames.DataFrame +│ Row │ ID │ Name │ Job │ +├─────┼────┼──────────┼────────┤ +│ 1 │ 20 │ John Doe │ Lawyer │ +│ 2 │ 40 │ Jane Doe │ Doctor │ -| Row | ID | Name | Job | -|-----|----|------------|----------| -| 1 | 20 | "John Doe" | "Lawyer" | -| 2 | 40 | "Jane Doe" | "Doctor" | +``` In relational database theory, this operation is generally referred to as a join. The columns used to determine which rows should be combined during a join are called keys. @@ -32,43 +45,156 @@ There are seven kinds of joins supported by the DataFrames package: - Anti: The output contains rows for values of the key that exist in the first (left) but not the second (right) argument to `join`. As with semi joins, output is restricted to columns from the first (left) argument. - Cross: The output is the cartesian product of rows from the first (left) and second (right) arguments to `join`. +See [the Wikipedia page on SQL joins](https://en.wikipedia.org/wiki/Join_(SQL)) for more information. 
+ You can control the kind of join that `join` performs using the `kind` keyword argument: -```julia -a = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"]) -b = DataFrame(ID = [20, 60], Job = ["Lawyer", "Astronaut"]) -join(a, b, on = :ID, kind = :inner) -join(a, b, on = :ID, kind = :left) -join(a, b, on = :ID, kind = :right) -join(a, b, on = :ID, kind = :outer) -join(a, b, on = :ID, kind = :semi) -join(a, b, on = :ID, kind = :anti) +```jldoctest joins +julia> jobs = DataFrame(ID = [20, 60], Job = ["Lawyer", "Astronaut"]) +2×2 DataFrames.DataFrame +│ Row │ ID │ Job │ +├─────┼────┼───────────┤ +│ 1 │ 20 │ Lawyer │ +│ 2 │ 60 │ Astronaut │ + +julia> join(names, jobs, on = :ID, kind = :inner) +1×3 DataFrames.DataFrame +│ Row │ ID │ Name │ Job │ +├─────┼────┼──────────┼────────┤ +│ 1 │ 20 │ John Doe │ Lawyer │ + +julia> join(names, jobs, on = :ID, kind = :left) +2×3 DataFrames.DataFrame +│ Row │ ID │ Name │ Job │ +├─────┼────┼──────────┼────────┤ +│ 1 │ 20 │ John Doe │ Lawyer │ +│ 2 │ 40 │ Jane Doe │ null │ + +julia> join(names, jobs, on = :ID, kind = :right) +2×3 DataFrames.DataFrame +│ Row │ ID │ Name │ Job │ +├─────┼────┼──────────┼───────────┤ +│ 1 │ 20 │ John Doe │ Lawyer │ +│ 2 │ 60 │ null │ Astronaut │ + +julia> join(names, jobs, on = :ID, kind = :outer) +3×3 DataFrames.DataFrame +│ Row │ ID │ Name │ Job │ +├─────┼────┼──────────┼───────────┤ +│ 1 │ 20 │ John Doe │ Lawyer │ +│ 2 │ 40 │ Jane Doe │ null │ +│ 3 │ 60 │ null │ Astronaut │ + +julia> join(names, jobs, on = :ID, kind = :semi) +1×2 DataFrames.DataFrame +│ Row │ ID │ Name │ +├─────┼────┼──────────┤ +│ 1 │ 20 │ John Doe │ + +julia> join(names, jobs, on = :ID, kind = :anti) +1×2 DataFrames.DataFrame +│ Row │ ID │ Name │ +├─────┼────┼──────────┤ +│ 1 │ 40 │ Jane Doe │ + ``` Cross joins are the only kind of join that does not use a key: -```julia -join(a, b, kind = :cross) +```jldoctest joins +julia> join(names, jobs, kind = :cross) +4×4 DataFrames.DataFrame +│ Row │ ID │ Name │ ID_1 │ Job │ 
+├─────┼────┼──────────┼──────┼───────────┤ +│ 1 │ 20 │ John Doe │ 20 │ Lawyer │ +│ 2 │ 20 │ John Doe │ 60 │ Astronaut │ +│ 3 │ 40 │ Jane Doe │ 20 │ Lawyer │ +│ 4 │ 40 │ Jane Doe │ 60 │ Astronaut │ + ``` In order to join data tables on keys which have different names, you must first rename them so that they match. This can be done using rename!: -```julia -a = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"]) -b = DataFrame(IDNew = [20, 40], Job = ["Lawyer", "Doctor"]) -rename!(b, :IDNew, :ID) -join(a, b, on = :ID, kind = :inner) +```jldoctest joins +julia> a = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"]) +2×2 DataFrames.DataFrame +│ Row │ ID │ Name │ +├─────┼────┼──────────┤ +│ 1 │ 20 │ John Doe │ +│ 2 │ 40 │ Jane Doe │ + +julia> b = DataFrame(IDNew = [20, 40], Job = ["Lawyer", "Doctor"]) +2×2 DataFrames.DataFrame +│ Row │ IDNew │ Job │ +├─────┼───────┼────────┤ +│ 1 │ 20 │ Lawyer │ +│ 2 │ 40 │ Doctor │ + +julia> rename!(b, :IDNew, :ID) +2×2 DataFrames.DataFrame +│ Row │ ID │ Job │ +├─────┼────┼────────┤ +│ 1 │ 20 │ Lawyer │ +│ 2 │ 40 │ Doctor │ + +julia> join(a, b, on = :ID, kind = :inner) +2×3 DataFrames.DataFrame +│ Row │ ID │ Name │ Job │ +├─────┼────┼──────────┼────────┤ +│ 1 │ 20 │ John Doe │ Lawyer │ +│ 2 │ 40 │ Jane Doe │ Doctor │ + ``` Or renaming multiple columns at a time: -```julia -a = DataFrame(City = ["Amsterdam", "London", "London", "New York", "New York"], - Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], - Category = [1, 2, 3, 4, 5]) -b = DataFrame(Location = ["Amsterdam", "London", "London", "New York", "New York"], - Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], - Name = ["a", "b", "c", "d", "e"]) -rename!(b, [:Location => :City, :Work => :Job]) -join(a, b, on = [:City, :Job]) +```jldoctest joins +julia> a = DataFrame(City = ["Amsterdam", "London", "London", "New York", "New York"], + Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], + Category = [1, 2, 3, 4, 5]) +5×3 DataFrames.DataFrame +│ 
Row │ City │ Job │ Category │ +├─────┼───────────┼────────┼──────────┤ +│ 1 │ Amsterdam │ Lawyer │ 1 │ +│ 2 │ London │ Lawyer │ 2 │ +│ 3 │ London │ Lawyer │ 3 │ +│ 4 │ New York │ Doctor │ 4 │ +│ 5 │ New York │ Doctor │ 5 │ + +julia> b = DataFrame(Location = ["Amsterdam", "London", "London", "New York", "New York"], + Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], + Name = ["a", "b", "c", "d", "e"]) +5×3 DataFrames.DataFrame +│ Row │ Location │ Work │ Name │ +├─────┼───────────┼────────┼──────┤ +│ 1 │ Amsterdam │ Lawyer │ a │ +│ 2 │ London │ Lawyer │ b │ +│ 3 │ London │ Lawyer │ c │ +│ 4 │ New York │ Doctor │ d │ +│ 5 │ New York │ Doctor │ e │ + +julia> rename!(b, [:Location => :City, :Work => :Job]) +5×3 DataFrames.DataFrame +│ Row │ City │ Job │ Name │ +├─────┼───────────┼────────┼──────┤ +│ 1 │ Amsterdam │ Lawyer │ a │ +│ 2 │ London │ Lawyer │ b │ +│ 3 │ London │ Lawyer │ c │ +│ 4 │ New York │ Doctor │ d │ +│ 5 │ New York │ Doctor │ e │ + +julia> join(a, b, on = [:City, :Job]) +9×4 DataFrames.DataFrame +│ Row │ City │ Job │ Category │ Name │ +├─────┼───────────┼────────┼──────────┼──────┤ +│ 1 │ Amsterdam │ Lawyer │ 1 │ a │ +│ 2 │ London │ Lawyer │ 2 │ b │ +│ 3 │ London │ Lawyer │ 2 │ c │ +│ 4 │ London │ Lawyer │ 3 │ b │ +│ 5 │ London │ Lawyer │ 3 │ c │ +│ 6 │ New York │ Doctor │ 4 │ d │ +│ 7 │ New York │ Doctor │ 4 │ e │ +│ 8 │ New York │ Doctor │ 5 │ d │ +│ 9 │ New York │ Doctor │ 5 │ e │ + ``` diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md index 398d03a373..d883281011 100644 --- a/docs/src/man/querying_frameworks.md +++ b/docs/src/man/querying_frameworks.md @@ -12,59 +12,77 @@ A query is started with the `@from` macro and consists of a series of query comm A simple example of a query looks like this: -```@setup 1 -using DataFrames, Query -``` - -```@example 1 -using DataFrames, Query +```jldoctest query +julia> using DataFrames, Query + +julia> df = DataFrame(name=["John", "Sally", "Roger"], age=[54., 34., 
79.], children=[0, 2, 4]) +3×3 DataFrames.DataFrame +│ Row │ name │ age │ children │ +├─────┼───────┼──────┼──────────┤ +│ 1 │ John │ 54.0 │ 0 │ +│ 2 │ Sally │ 34.0 │ 2 │ +│ 3 │ Roger │ 79.0 │ 4 │ + +julia> q1 = @from i in df begin + @where i.age > 40 + @select {number_of_children=i.children, i.name} + @collect DataFrame + end +2×2 DataFrames.DataFrame +│ Row │ number_of_children │ name │ +├─────┼────────────────────┼───────┤ +│ 1 │ 0 │ John │ +│ 2 │ 4 │ Roger │ -df = DataFrame(name=["John", "Sally", "Roger"], age=[54., 34., 79.], children=[0, 2, 4]) - -q1 = @from i in df begin - @where i.age > 40 - @select {number_of_children=i.children, i.name} - @collect DataFrame -end ``` The query starts with the `@from` macro. The first argument `i` is the name of the range variable that will be used to refer to an individual row in later query commands. The next argument `df` is the data source that one wants to query. The `@where` command in this query will filter the source data by applying the filter condition `i.age > 40`. This filters out any rows in which the `age` column is not larger than 40. The `@select` command then projects the columns of the source data onto a new column structure. The example here applies three specific modifications: 1) it only keeps a subset of the columns in the source `DataFrame`, i.e. the `age` column will not be part of the transformed data; 2) it changes the order of the two columns that are selected; and 3) it renames one of the columns that is selected from `children` to `number_of_children`. The example query uses the `{}` syntax to achieve this. A `{}` in a Query.jl expression instantiates a new [NamedTuple](https://github.com/blackrock/NamedTuples.jl), i.e. it is a shortcut for writing `@NT(number_of_children=>i.children, name=>i.name)`. The `@collect` statement determines the data structure that the query returns. In this example the results are returned as a `DataFrame`. 
A query without a `@collect` statement returns a standard julia iterator that can be used with any normal julia language construct that can deal with iterators. The following code returns a julia iterator for the query results: -```@example 1 -q2 = @from i in df begin - @where i.age > 40 - @select {number_of_children=i.children, i.name} -end -nothing # hide +```jldoctest query +julia> q2 = @from i in df begin + @where i.age > 40 + @select {number_of_children=i.children, i.name} + end; # suppress printing the iterator type + ``` One can loop over the results using a standard julia `for` statement: -```@example 1 -total_children = 0 -for i in q2 - total_children += i.number_of_children -end +```jldoctest query +julia> total_children = 0 +0 + +julia> for i in q2 + total_children += i.number_of_children + end + +julia> total_children +4 -println("Total number of children: $(get(total_children))") ``` Or one can use a comprehension to extract the name of a subset of rows: -```@example 1 -y = [i.name for i in q2 if i.number_of_children > 0] +```jldoctest query +julia> y = [i.name for i in q2 if i.number_of_children > 0] +1-element Array{String,1}: + "Roger" + ``` The last example (extracting only the name and applying a second filter) could of course be completely expressed as a query expression: -```@example 1 -q3 = @from i in df begin - @where i.age > 40 && i.children > 0 - @select i.name - @collect -end +```jldoctest query +julia> q3 = @from i in df begin + @where i.age > 40 && i.children > 0 + @select i.name + @collect + end +1-element Array{String,1}: + "Roger" + ``` A query that ends with a `@collect` statement without a specific type will materialize the query results into an array. Note also the difference in the `@select` statement: The previous queries all used the `{}` syntax in the `@select` statement to project results into a tabular format. The last query instead just selects a single value from each row in the `@select` statement. 
diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index a532d35b8d..a94c7370d8 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -2,18 +2,87 @@ Reshape data from wide to long format using the `stack` function: -```julia -using DataFrames -using CSV -iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), DataFrame) -iris[:id] = 1:size(iris, 1) # this makes it easier to unstack -d = stack(iris, 1:4) +```jldoctest reshape +julia> using DataFrames, CSV + +julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │ +│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │ +│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │ +│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │ +│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ virginica │ +│ 2 │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ virginica │ +│ 3 │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ virginica │ +│ 4 │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ virginica │ +│ 5 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ virginica │ +│ 6 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ virginica │ + + +julia> d = stack(iris, 1:4); + +julia> head(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼─────────────┼───────┼─────────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ +│ 2 │ SepalLength │ 4.9 │ setosa │ +│ 3 │ SepalLength │ 4.7 │ setosa │ +│ 4 │ SepalLength │ 4.6 │ setosa │ +│ 5 │ SepalLength │ 5.0 │ setosa │ +│ 6 │ SepalLength │ 5.4 │ setosa │ + +julia> tail(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ 
+├─────┼────────────┼───────┼───────────┤ +│ 1 │ PetalWidth │ 2.5 │ virginica │ +│ 2 │ PetalWidth │ 2.3 │ virginica │ +│ 3 │ PetalWidth │ 1.9 │ virginica │ +│ 4 │ PetalWidth │ 2.0 │ virginica │ +│ 5 │ PetalWidth │ 2.3 │ virginica │ +│ 6 │ PetalWidth │ 1.8 │ virginica │ + ``` The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given: -```julia -d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]) +```jldoctest reshape +julia> d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]); + +julia> head(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼─────────────┼───────┼─────────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ +│ 2 │ SepalLength │ 4.9 │ setosa │ +│ 3 │ SepalLength │ 4.7 │ setosa │ +│ 4 │ SepalLength │ 4.6 │ setosa │ +│ 5 │ SepalLength │ 5.0 │ setosa │ +│ 6 │ SepalLength │ 5.4 │ setosa │ + +julia> tail(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼────────────┼───────┼───────────┤ +│ 1 │ PetalWidth │ 2.5 │ virginica │ +│ 2 │ PetalWidth │ 2.3 │ virginica │ +│ 3 │ PetalWidth │ 1.9 │ virginica │ +│ 4 │ PetalWidth │ 2.0 │ virginica │ +│ 5 │ PetalWidth │ 2.3 │ virginica │ +│ 6 │ PetalWidth │ 1.8 │ virginica │ + ``` Note that all columns can be of different types. Type promotion follows the rules of `vcat`. @@ -22,41 +91,176 @@ The stacked DataFrame that results includes all of the columns not specified to A third optional argument to `stack` represents the id columns that are repeated. 
This makes it easier to specify which variables you want included in the long format: -```julia -d = stack(iris, [:SepalLength, :SepalWidth], :Species) +```jldoctest reshape +julia> d = stack(iris, [:SepalLength, :SepalWidth], :Species); + +julia> head(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼─────────────┼───────┼─────────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ +│ 2 │ SepalLength │ 4.9 │ setosa │ +│ 3 │ SepalLength │ 4.7 │ setosa │ +│ 4 │ SepalLength │ 4.6 │ setosa │ +│ 5 │ SepalLength │ 5.0 │ setosa │ +│ 6 │ SepalLength │ 5.4 │ setosa │ + +julia> tail(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼────────────┼───────┼───────────┤ +│ 1 │ SepalWidth │ 3.3 │ virginica │ +│ 2 │ SepalWidth │ 3.0 │ virginica │ +│ 3 │ SepalWidth │ 2.5 │ virginica │ +│ 4 │ SepalWidth │ 3.0 │ virginica │ +│ 5 │ SepalWidth │ 3.4 │ virginica │ +│ 6 │ SepalWidth │ 3.0 │ virginica │ + ``` `melt` is an alternative function to reshape from wide to long format. It is based on `stack`, but it prefers specification of the id columns as: -```julia -d = melt(iris, :Species) -``` +```jldoctest reshape +julia> d = melt(iris, :Species); -All other columns are assumed to be measured variables (they are stacked). +julia> head(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼─────────────┼───────┼─────────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ +│ 2 │ SepalLength │ 4.9 │ setosa │ +│ 3 │ SepalLength │ 4.7 │ setosa │ +│ 4 │ SepalLength │ 4.6 │ setosa │ +│ 5 │ SepalLength │ 5.0 │ setosa │ +│ 6 │ SepalLength │ 5.4 │ setosa │ -You can also stack an entire DataFrame. 
The default stacks all floating-point columns: +julia> tail(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼────────────┼───────┼───────────┤ +│ 1 │ PetalWidth │ 2.5 │ virginica │ +│ 2 │ PetalWidth │ 2.3 │ virginica │ +│ 3 │ PetalWidth │ 1.9 │ virginica │ +│ 4 │ PetalWidth │ 2.0 │ virginica │ +│ 5 │ PetalWidth │ 2.3 │ virginica │ +│ 6 │ PetalWidth │ 1.8 │ virginica │ -```julia -d = stack(iris) ``` `unstack` converts from a long format to a wide format. By default, it requires specifying which column is the id variable, which column holds the variable names, and which column holds the values: -```julia -longdf = melt(iris, [:Species, :id]) -widedf = unstack(longdf, :id, :variable, :value) +```jldoctest reshape +julia> iris[:id] = 1:size(iris, 1) +1:150 + +julia> longdf = melt(iris, [:Species, :id]); + +julia> head(longdf) +6×4 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ id │ +├─────┼─────────────┼───────┼─────────┼────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ 1 │ +│ 2 │ SepalLength │ 4.9 │ setosa │ 2 │ +│ 3 │ SepalLength │ 4.7 │ setosa │ 3 │ +│ 4 │ SepalLength │ 4.6 │ setosa │ 4 │ +│ 5 │ SepalLength │ 5.0 │ setosa │ 5 │ +│ 6 │ SepalLength │ 5.4 │ setosa │ 6 │ + +julia> tail(longdf) +6×4 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ id │ +├─────┼────────────┼───────┼───────────┼─────┤ +│ 1 │ PetalWidth │ 2.5 │ virginica │ 145 │ +│ 2 │ PetalWidth │ 2.3 │ virginica │ 146 │ +│ 3 │ PetalWidth │ 1.9 │ virginica │ 147 │ +│ 4 │ PetalWidth │ 2.0 │ virginica │ 148 │ +│ 5 │ PetalWidth │ 2.3 │ virginica │ 149 │ +│ 6 │ PetalWidth │ 1.8 │ virginica │ 150 │ + +julia> widedf = unstack(longdf, :id, :variable, :value); + +julia> head(widedf) +6×5 DataFrames.DataFrame +│ Row │ id │ PetalLength │ PetalWidth │ SepalLength │ SepalWidth │ +├─────┼────┼─────────────┼────────────┼─────────────┼────────────┤ +│ 1 │ 1 │ 1.4 │ 0.2 │ 5.1 │ 3.5 │ +│ 2 │ 2 │ 1.4 │ 0.2 │ 4.9 │ 3.0 │ +│ 3 │ 3 │ 1.3 │ 0.2 │ 4.7 │ 3.2 │ +│ 4 │ 4 │ 1.5 │ 0.2 │ 4.6 │ 3.1 │ +│ 5 │ 5 │ 1.4 │ 0.2 │ 5.0
│ 3.6 │ +│ 6 │ 6 │ 1.7 │ 0.4 │ 5.4 │ 3.9 │ + +julia> tail(widedf) +6×5 DataFrames.DataFrame +│ Row │ id │ PetalLength │ PetalWidth │ SepalLength │ SepalWidth │ +├─────┼─────┼─────────────┼────────────┼─────────────┼────────────┤ +│ 1 │ 145 │ 5.7 │ 2.5 │ 6.7 │ 3.3 │ +│ 2 │ 146 │ 5.2 │ 2.3 │ 6.7 │ 3.0 │ +│ 3 │ 147 │ 5.0 │ 1.9 │ 6.3 │ 2.5 │ +│ 4 │ 148 │ 5.2 │ 2.0 │ 6.5 │ 3.0 │ +│ 5 │ 149 │ 5.4 │ 2.3 │ 6.2 │ 3.4 │ +│ 6 │ 150 │ 5.1 │ 1.8 │ 5.9 │ 3.0 │ + ``` If the remaining columns are unique, you can skip the id variable and use: -```julia -widedf = unstack(longdf, :variable, :value) +```jldoctest reshape +julia> longdf = melt(iris, [:Species, :id]); + +julia> head(longdf) +6×4 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ id │ +├─────┼─────────────┼───────┼─────────┼────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ 1 │ +│ 2 │ SepalLength │ 4.9 │ setosa │ 2 │ +│ 3 │ SepalLength │ 4.7 │ setosa │ 3 │ +│ 4 │ SepalLength │ 4.6 │ setosa │ 4 │ +│ 5 │ SepalLength │ 5.0 │ setosa │ 5 │ +│ 6 │ SepalLength │ 5.4 │ setosa │ 6 │ + +julia> widedf = unstack(longdf, :variable, :value); + +julia> head(widedf) +6×6 DataFrames.DataFrame +│ Row │ Species │ id │ PetalLength │ PetalWidth │ SepalLength │ SepalWidth │ +├─────┼─────────┼────┼─────────────┼────────────┼─────────────┼────────────┤ +│ 1 │ setosa │ 1 │ 1.4 │ 0.2 │ 5.1 │ 3.5 │ +│ 2 │ setosa │ 2 │ 1.4 │ 0.2 │ 4.9 │ 3.0 │ +│ 3 │ setosa │ 3 │ 1.3 │ 0.2 │ 4.7 │ 3.2 │ +│ 4 │ setosa │ 4 │ 1.5 │ 0.2 │ 4.6 │ 3.1 │ +│ 5 │ setosa │ 5 │ 1.4 │ 0.2 │ 5.0 │ 3.6 │ +│ 6 │ setosa │ 6 │ 1.7 │ 0.4 │ 5.4 │ 3.9 │ + ``` `stackdf` and `meltdf` are two additional functions that work like `stack` and `melt`, but they provide a view into the original wide DataFrame. 
Here is an example: -```julia -d = stackdf(iris) +```jldoctest reshape +julia> d = stackdf(iris); + +julia> head(d) +6×4 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ id │ +├─────┼─────────────┼───────┼─────────┼────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ 1 │ +│ 2 │ SepalLength │ 4.9 │ setosa │ 2 │ +│ 3 │ SepalLength │ 4.7 │ setosa │ 3 │ +│ 4 │ SepalLength │ 4.6 │ setosa │ 4 │ +│ 5 │ SepalLength │ 5.0 │ setosa │ 5 │ +│ 6 │ SepalLength │ 5.4 │ setosa │ 6 │ + +julia> tail(d) +6×4 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ id │ +├─────┼────────────┼───────┼───────────┼─────┤ +│ 1 │ PetalWidth │ 2.5 │ virginica │ 145 │ +│ 2 │ PetalWidth │ 2.3 │ virginica │ 146 │ +│ 3 │ PetalWidth │ 1.9 │ virginica │ 147 │ +│ 4 │ PetalWidth │ 2.0 │ virginica │ 148 │ +│ 5 │ PetalWidth │ 2.3 │ virginica │ 149 │ +│ 6 │ PetalWidth │ 1.8 │ virginica │ 150 │ + ``` This saves memory. To create the view, several AbstractVectors are defined: @@ -70,16 +274,43 @@ This is provides a view of the original columns stacked together. Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. -For more details on the storage representation, see: +None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example: -```julia -dump(stackdf(iris)) -``` +```jldoctest reshape +julia> d = melt(iris, :Species); -None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. 
Here is an example: +julia> head(d) +6×3 DataFrames.DataFrame +│ Row │ variable │ value │ Species │ +├─────┼─────────────┼───────┼─────────┤ +│ 1 │ SepalLength │ 5.1 │ setosa │ +│ 2 │ SepalLength │ 4.9 │ setosa │ +│ 3 │ SepalLength │ 4.7 │ setosa │ +│ 4 │ SepalLength │ 4.6 │ setosa │ +│ 5 │ SepalLength │ 5.0 │ setosa │ +│ 6 │ SepalLength │ 5.4 │ setosa │ + +julia> x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(df[:value]))); + +julia> head(x) +6×3 DataFrames.DataFrame +│ Row │ variable │ Species │ vsum │ +├─────┼─────────────┼────────────┼───────┤ +│ 1 │ SepalLength │ setosa │ 5.006 │ +│ 2 │ SepalLength │ versicolor │ 5.936 │ +│ 3 │ SepalLength │ virginica │ 6.588 │ +│ 4 │ SepalWidth │ setosa │ 3.428 │ +│ 5 │ SepalWidth │ versicolor │ 2.77 │ +│ 6 │ SepalWidth │ virginica │ 2.974 │ + +julia> head(unstack(x, :Species, :vsum)) +5×4 DataFrames.DataFrame +│ Row │ variable │ setosa │ versicolor │ virginica │ +├─────┼─────────────┼────────┼────────────┼───────────┤ +│ 1 │ PetalLength │ 1.462 │ 4.26 │ 5.552 │ +│ 2 │ PetalWidth │ 0.246 │ 1.326 │ 2.026 │ +│ 3 │ SepalLength │ 5.006 │ 5.936 │ 6.588 │ +│ 4 │ SepalWidth │ 3.428 │ 2.77 │ 2.974 │ +│ 5 │ id │ 25.5 │ 75.5 │ 125.5 │ -```julia -d = stack(iris) -x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(Nulls.skip(df[:value])))) -unstack(x, :Species, :vsum) ``` diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md index 68f3db2683..e625e14830 100644 --- a/docs/src/man/sorting.md +++ b/docs/src/man/sorting.md @@ -2,21 +2,113 @@ Sorting is a fundamental component of data analysis. 
Basic sorting is trivial: just calling `sort!` will sort all columns, in place: -```julia -using DataFrames -using CSV -iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), DataFrame) -sort!(iris) +```jldoctest sort +julia> using DataFrames, CSV + +julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")); + +julia> sort!(iris); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 4.3 │ 3.0 │ 1.1 │ 0.1 │ setosa │ +│ 2 │ 4.4 │ 2.9 │ 1.4 │ 0.2 │ setosa │ +│ 3 │ 4.4 │ 3.0 │ 1.3 │ 0.2 │ setosa │ +│ 4 │ 4.4 │ 3.2 │ 1.3 │ 0.2 │ setosa │ +│ 5 │ 4.5 │ 2.3 │ 1.3 │ 0.3 │ setosa │ +│ 6 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 7.6 │ 3.0 │ 6.6 │ 2.1 │ virginica │ +│ 2 │ 7.7 │ 2.6 │ 6.9 │ 2.3 │ virginica │ +│ 3 │ 7.7 │ 2.8 │ 6.7 │ 2.0 │ virginica │ +│ 4 │ 7.7 │ 3.0 │ 6.1 │ 2.3 │ virginica │ +│ 5 │ 7.7 │ 3.8 │ 6.7 │ 2.2 │ virginica │ +│ 6 │ 7.9 │ 3.8 │ 6.4 │ 2.0 │ virginica │ + ``` When sorting DataFrames, you may want to sort different columns with different options.
Here are some examples showing most of the possible options: -```julia -sort!(iris, rev = true) -sort!(iris, cols = [:SepalWidth, :SepalLength]) +```jldoctest sort +julia> sort!(iris, rev = true); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 7.9 │ 3.8 │ 6.4 │ 2.0 │ virginica │ +│ 2 │ 7.7 │ 3.8 │ 6.7 │ 2.2 │ virginica │ +│ 3 │ 7.7 │ 3.0 │ 6.1 │ 2.3 │ virginica │ +│ 4 │ 7.7 │ 2.8 │ 6.7 │ 2.0 │ virginica │ +│ 5 │ 7.7 │ 2.6 │ 6.9 │ 2.3 │ virginica │ +│ 6 │ 7.6 │ 3.0 │ 6.6 │ 2.1 │ virginica │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │ +│ 2 │ 4.5 │ 2.3 │ 1.3 │ 0.3 │ setosa │ +│ 3 │ 4.4 │ 3.2 │ 1.3 │ 0.2 │ setosa │ +│ 4 │ 4.4 │ 3.0 │ 1.3 │ 0.2 │ setosa │ +│ 5 │ 4.4 │ 2.9 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ 4.3 │ 3.0 │ 1.1 │ 0.1 │ setosa │ + +julia> sort!(iris, cols = [:SepalWidth, :SepalLength]); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼────────────┤ +│ 1 │ 5.0 │ 2.0 │ 3.5 │ 1.0 │ versicolor │ +│ 2 │ 6.0 │ 2.2 │ 5.0 │ 1.5 │ virginica │ +│ 3 │ 6.0 │ 2.2 │ 4.0 │ 1.0 │ versicolor │ +│ 4 │ 6.2 │ 2.2 │ 4.5 │ 1.5 │ versicolor │ +│ 5 │ 4.5 │ 2.3 │ 1.3 │ 0.3 │ setosa │ +│ 6 │ 5.0 │ 2.3 │ 3.3 │ 1.0 │ versicolor │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │ +│ 2 │ 5.4 │ 3.9 │ 1.3 │ 0.4 │ setosa │ +│ 3 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │ +│ 4 │ 5.2 │ 4.1 │ 1.5 │ 0.1 │ setosa │ +│ 5 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ 
5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │ + +julia> sort!(iris, cols = [order(:Species, by = uppercase), + order(:SepalLength, rev = true)]); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │ +│ 2 │ 5.7 │ 3.8 │ 1.7 │ 0.3 │ setosa │ +│ 3 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │ +│ 4 │ 5.5 │ 3.5 │ 1.3 │ 0.2 │ setosa │ +│ 5 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ 5.4 │ 3.4 │ 1.7 │ 0.2 │ setosa │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 2 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 3 │ 5.8 │ 2.8 │ 5.1 │ 2.4 │ virginica │ +│ 4 │ 5.7 │ 2.5 │ 5.0 │ 2.0 │ virginica │ +│ 5 │ 5.6 │ 2.8 │ 4.9 │ 2.0 │ virginica │ +│ 6 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ virginica │ -sort!(iris, cols = [order(:Species, by = uppercase), - order(:SepalLength, rev = true)]) ``` Keywords used above include `cols` (to specify columns), `rev` (to sort a column or the whole DataFrame in reverse), and `by` (to apply a function to a column/DataFrame). Each keyword can either be a single value, or can be a tuple or array, with values corresponding to individual columns. 
@@ -25,9 +117,54 @@ As an alternative to using array or tuple values, `order` to specify an ordering The following two examples show two ways to sort the `iris` dataset with the same result: `Species` will be ordered in reverse lexicographic order, and within species, rows will be sorted by increasing sepal length and width: -```julia -sort!(iris, cols = (:Species, :SepalLength, :SepalWidth), - rev = (true, false, false)) +```jldoctest sort +julia> sort!(iris, cols = (:Species, :SepalLength, :SepalWidth), + rev = (true, false, false)); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ virginica │ +│ 2 │ 5.6 │ 2.8 │ 4.9 │ 2.0 │ virginica │ +│ 3 │ 5.7 │ 2.5 │ 5.0 │ 2.0 │ virginica │ +│ 4 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 5 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 6 │ 5.8 │ 2.8 │ 5.1 │ 2.4 │ virginica │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.4 │ 3.9 │ 1.3 │ 0.4 │ setosa │ +│ 2 │ 5.5 │ 3.5 │ 1.3 │ 0.2 │ setosa │ +│ 3 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │ +│ 4 │ 5.7 │ 3.8 │ 1.7 │ 0.3 │ setosa │ +│ 5 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │ +│ 6 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │ + +julia> sort!(iris, cols = (order(:Species, rev = true), :SepalLength, :SepalWidth)); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ virginica │ +│ 2 │ 5.6 │ 2.8 │ 4.9 │ 2.0 │ virginica │ +│ 3 │ 5.7 │ 2.5 │ 5.0 │ 2.0 │ virginica │ +│ 4 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 5 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 6 │ 5.8 │ 2.8 │ 5.1 │ 2.4 │ virginica │ + +julia> tail(iris) +6×5 
DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.4 │ 3.9 │ 1.3 │ 0.4 │ setosa │ +│ 2 │ 5.5 │ 3.5 │ 1.3 │ 0.2 │ setosa │ +│ 3 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │ +│ 4 │ 5.7 │ 3.8 │ 1.7 │ 0.3 │ setosa │ +│ 5 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │ +│ 6 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │ -sort!(iris, cols = (order(:Species, rev = true), :SepalLength, :SepalWidth)) ``` diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 78fb637c39..2f87690b99 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -6,37 +6,104 @@ The DataFrames package supports the Split-Apply-Combine strategy through the `by We show several examples of the `by` function applied to the `iris` dataset below: -```julia -using DataFrames -using CSV -iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), DataFrame) - -by(iris, :Species, size) -by(iris, :Species, df -> mean(Nulls.skip(df[:PetalLength]))) -by(iris, :Species, df -> DataFrame(N = size(df, 1))) +```jldoctest sac +julia> using DataFrames, CSV + +julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")); + +julia> head(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤ +│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │ +│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │ +│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │ +│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │ +│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │ + +julia> tail(iris) +6×5 DataFrames.DataFrame +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤ +│ 1 │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ virginica │ +│ 2 │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ virginica │ +│ 3 │ 6.3 
│ 2.5 │ 5.0 │ 1.9 │ virginica │ +│ 4 │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ virginica │ +│ 5 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ virginica │ +│ 6 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ virginica │ + + +julia> by(iris, :Species, size) +3×2 DataFrames.DataFrame +│ Row │ Species │ x1 │ +├─────┼────────────┼─────────┤ +│ 1 │ setosa │ (50, 5) │ +│ 2 │ versicolor │ (50, 5) │ +│ 3 │ virginica │ (50, 5) │ + +julia> by(iris, :Species, df -> mean(df[:PetalLength])) +3×2 DataFrames.DataFrame +│ Row │ Species │ x1 │ +├─────┼────────────┼───────┤ +│ 1 │ setosa │ 1.462 │ +│ 2 │ versicolor │ 4.26 │ +│ 3 │ virginica │ 5.552 │ + +julia> by(iris, :Species, df -> DataFrame(N = size(df, 1))) +3×2 DataFrames.DataFrame +│ Row │ Species │ N │ +├─────┼────────────┼────┤ +│ 1 │ setosa │ 50 │ +│ 2 │ versicolor │ 50 │ +│ 3 │ virginica │ 50 │ + ``` The `by` function also supports the `do` block form: -```julia -by(iris, :Species) do df - DataFrame(m = mean(Nulls.skip(df[:PetalLength])), s² = var(Nulls.skip(df[:PetalLength]))) -end +```jldoctest sac +julia> by(iris, :Species) do df + DataFrame(m = mean(df[:PetalLength]), s² = var(df[:PetalLength])) + end +3×3 DataFrames.DataFrame +│ Row │ Species │ m │ s² │ +├─────┼────────────┼───────┼───────────┤ +│ 1 │ setosa │ 1.462 │ 0.0301592 │ +│ 2 │ versicolor │ 4.26 │ 0.220816 │ +│ 3 │ virginica │ 5.552 │ 0.304588 │ + ``` -A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) one or more functions that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column, that was not used to split the DataFrame, creating new columns of the form `$name_$function` e.g. `SepalLength_mean`. Anonymous functions and expressions that do not have a name will be called `λ1`.
+A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) one or more functions that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column that was not used to split the DataFrame, creating new columns of the form `$name_$function`. For named functions like `mean` this will produce columns with names like `SepalLength_mean`. For anonymous functions like `x -> sqrt(x)^e`, which Julia tracks and references by a numerical identifier e.g. `#12`, the produced columns will be `SepalLength_#12`. We show several examples of the `aggregate` function applied to the `iris` dataset below: -We show several examples of the `aggregate` function applied to the `iris` dataset below: +```jldoctest sac +julia> aggregate(iris, :Species, length) +3×5 DataFrames.DataFrame +│ Row │ Species │ SepalLength_length │ SepalWidth_length │ PetalLength_length │ PetalWidth_length │ +├─────┼────────────┼────────────────────┼───────────────────┼────────────────────┼───────────────────┤ +│ 1 │ setosa │ 50 │ 50 │ 50 │ 50 │ +│ 2 │ versicolor │ 50 │ 50 │ 50 │ 50 │ +│ 3 │ virginica │ 50 │ 50 │ 50 │ 50 │ + +julia> aggregate(iris, :Species, [sum, mean]) +3×9 DataFrames.DataFrame +│ Row │ Species │ SepalLength_sum │ SepalWidth_sum │ PetalLength_sum │ PetalWidth_sum │ SepalLength_mean │ SepalWidth_mean │ PetalLength_mean │ PetalWidth_mean │ +├─────┼────────────┼─────────────────┼────────────────┼─────────────────┼────────────────┼──────────────────┼─────────────────┼──────────────────┼─────────────────┤ +│ 1 │ setosa │ 250.3 │ 171.4 │ 73.1 │ 12.3 │ 5.006 │ 3.428 │ 1.462 │ 0.246 │ +│ 2 │ versicolor │ 296.8 │ 138.5 │ 213.0 │ 66.3 │ 5.936 │ 2.77 │ 4.26 │ 1.326 │ +│ 3 │ virginica │ 329.4 │ 148.7 │ 277.6 │ 101.3 │ 6.588 │ 2.974 │ 5.552 │ 2.026 │ -```julia -aggregate(iris, :Species, sum) -aggregate(iris, :Species, 
[sum, x->mean(Nulls.skip(x))]) ``` If you only want to split the data set into subsets, use the `groupby` function: -```julia -for subdf in groupby(iris, :Species) - println(size(subdf, 1)) -end +```jldoctest sac +julia> for subdf in groupby(iris, :Species) + println(size(subdf, 1)) + end +50 +50 +50 + ``` diff --git a/docs/src/man/subsets.md b/docs/src/man/subsets.md index 29f4295dd6..b20d4ce189 100644 --- a/docs/src/man/subsets.md +++ b/docs/src/man/subsets.md @@ -2,7 +2,7 @@ A `DataFrame` supports many forms of indexing. -```julia +```jldoctest subsets julia> using DataFrames julia> df = DataFrame(A = 1:10, B = 2:2:20) @@ -19,11 +19,12 @@ julia> df = DataFrame(A = 1:10, B = 2:2:20) │ 8 │ 8 │ 16 │ │ 9 │ 9 │ 18 │ │ 10 │ 10 │ 20 │ + ``` Referring to the first column by index or name: -```julia +```jldoctest subsets julia> df[1] 10-element Array{Int64,1}: 1 @@ -49,21 +50,23 @@ julia> df[:A] 8 9 10 + ``` Referring to the first element of the first column: -```julia +```jldoctest subsets julia> df[1, 1] 1 julia> df[1, :A] 1 + ``` Selecting a subset of rows by index and an (ordered) subset of columns by name: -```julia +```jldoctest subsets julia> df[1:3, [:A, :B]] 3×2 DataFrames.DataFrame │ Row │ A │ B │ @@ -79,4 +82,5 @@ julia> df[1:3, [:B, :A]] │ 1 │ 2 │ 1 │ │ 2 │ 4 │ 2 │ │ 3 │ 6 │ 3 │ + ``` diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 2988c2610a..16c23d2239 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -49,7 +49,6 @@ export AbstractDataFrame, nrow, nullable!, order, - printtable, rename!, rename, showcols, diff --git a/src/other/utils.jl b/src/other/utils.jl index 8aba353e1a..bf2940d9eb 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -82,18 +82,12 @@ function make_unique(names::Vector{Symbol}; allow_duplicates=true) return names end -#' @description -#' -#' Generate standardized names for columns of a DataFrame. The -#' first name will be :x1, the second :x2, etc. -#' -#' @field n::Integer The number of names to generate.
-#' -#' @returns names::Vector{Symbol} A vector of standardized column names. -#' -#' @examples -#' -#' DataFrames.gennames(10) +""" + gennames(n::Integer) + +Generate standardized names for columns of a DataFrame. The first name will be `:x1`, the +second `:x2`, etc. +""" function gennames(n::Integer) res = Array{Symbol}(n) for i in 1:n @@ -103,17 +97,11 @@ function gennames(n::Integer) end -#' @description -#' -#' Count the number of null values in an array. -#' -#' @field a::AbstractArray The array whose missing values are to be counted. -#' -#' @returns count::Int The number of null values in `a`. -#' -#' @examples -#' -#' DataFrames.countnull([1, 2, 3]) +""" + countnull(a::AbstractArray) + +Count the number of `null` values in an array. +""" function countnull(a::AbstractArray) res = 0 for x in a @@ -122,18 +110,6 @@ function countnull(a::AbstractArray) return res end -#' @description -#' -#' Count the number of missing values in a CategoricalArray. -#' -#' @field na::CategoricalArray The CategoricalArray whose missing values -#' are to be counted. -#' -#' @returns count::Int The number of null values in `a`. -#' -#' @examples -#' -#' DataFrames.countnull(CategoricalArray([1, 2, 3])) function countnull(a::CategoricalArray) res = 0 for x in a.refs @@ -155,4 +131,3 @@ function _fnames(fs::Vector{T}) where T<:Function end names end - diff --git a/test/io.jl b/test/io.jl index cd527946db..001026e42a 100644 --- a/test/io.jl +++ b/test/io.jl @@ -50,7 +50,7 @@ module TestIO G = nulls(3), H = fill(null, 3)) - @test sprint(printtable, df) == + @test sprint(DataFrames.printtable, df) == """ "A","B","C","D","E","F","G","H" 1,"'a'","A","a","A","1",null,null