diff --git a/docs/make.jl b/docs/make.jl
index 43c18440d4..991ff07528 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -19,16 +19,11 @@ makedocs(
"Reshaping" => "man/reshaping_and_pivoting.md",
"Sorting" => "man/sorting.md",
"Categorical Data" => "man/categorical.md",
- "Querying frameworks" => "man/querying_frameworks.md",
+ "Querying frameworks" => "man/querying_frameworks.md"
],
"API" => Any[
- "Main types" => "lib/maintypes.md",
- "Utilities" => "lib/utilities.md",
- "Data manipulation" => "lib/manipulation.md",
- ],
- "About" => Any[
- "Release Notes" => "NEWS.md",
- "License" => "LICENSE.md",
+ "Types" => "lib/types.md",
+ "Functions" => "lib/functions.md"
]
]
)
diff --git a/docs/src/LICENSE.md b/docs/src/LICENSE.md
deleted file mode 100644
index 2c7be2a321..0000000000
--- a/docs/src/LICENSE.md
+++ /dev/null
@@ -1,23 +0,0 @@
-DataFrames.jl is licensed under the MIT License:
-
-> Copyright (c) 2012-2015: Harlan Harris, EPRI (Tom Short's code), Chris DuBois,
-> John Myles White, and other contributors.
->
-> Permission is hereby granted, free of charge, to any person obtaining
-> a copy of this software and associated documentation files (the
-> "Software"), to deal in the Software without restriction, including
-> without limitation the rights to use, copy, modify, merge, publish,
-> distribute, sublicense, and/or sell copies of the Software, and to
-> permit persons to whom the Software is furnished to do so, subject to
-> the following conditions:
->
-> The above copyright notice and this permission notice shall be
-> included in all copies or substantial portions of the Software.
->
-> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-> NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-> LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-> OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-> WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/docs/src/NEWS.md b/docs/src/NEWS.md
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/docs/src/index.md b/docs/src/index.md
index b6fe7e1dc9..27de73e917 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,21 +1,40 @@
-# DataFrames Documentation Outline
+# DataFrames.jl
+
+Welcome to the DataFrames documentation! This resource aims to teach you everything you need
+to know to get up and running with tabular data manipulation using the DataFrames.jl package
+and the Julia language. If there is something you expect DataFrames to be capable of, but
+cannot figure out how to do, please reach out with questions in Domains/Data on
+[Discourse](https://discourse.julialang.org/new-topic?title=[DataFrames%20Question]:%20&body=%23%20Question:%0A%0A%23%20Dataset%20(if%20applicable):%0A%0A%23%20Minimal%20Working%20Example%20(if%20applicable):%0A&category=Domains/Data&tags=question).
+Please report bugs by
+[opening an issue](https://github.com/JuliaData/DataFrames.jl/issues/new). You can follow
+the **source** links throughout the documentation to jump right to the
+source files on GitHub to make pull requests for improving the documentation and function
+capabilities. Please review
+[DataFrames contributing guidelines](https://github.com/JuliaData/DataFrames.jl/blob/master/CONTRIBUTING.md)
+before submitting your first PR! Information on specific versions can be found on the [Release page](https://github.com/JuliaData/DataFrames.jl/releases).
## Package Manual
```@contents
-Pages = ["man/getting_started.md", "man/joins.md", "man/split_apply_combine.md", "man/reshaping_and_pivoting.md", "man/sorting.md", "man/categorical.md", "man/querying_frameworks.md"]
+Pages = ["man/getting_started.md",
+ "man/joins.md",
+ "man/split_apply_combine.md",
+ "man/reshaping_and_pivoting.md",
+ "man/sorting.md",
+ "man/categorical.md",
+ "man/querying_frameworks.md"]
Depth = 2
```
## API
```@contents
-Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"]
+Pages = ["lib/types.md", "lib/functions.md"]
Depth = 2
```
-## Documentation Index
+## Index
```@index
-Pages = ["lib/maintypes.md", "lib/manipulation.md", "lib/utilities.md"]
+Pages = ["lib/types.md", "lib/functions.md"]
```
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
new file mode 100644
index 0000000000..6e2d7a4d3e
--- /dev/null
+++ b/docs/src/lib/functions.md
@@ -0,0 +1,54 @@
+```@meta
+CurrentModule = DataFrames
+```
+
+# Functions
+
+```@index
+Pages = ["functions.md"]
+```
+
+## Grouping, Joining, and Split-Apply-Combine
+
+```@docs
+aggregate
+by
+colwise
+groupby
+join
+melt
+stack
+unstack
+stackdf
+meltdf
+```
+
+## Basics
+
+```@docs
+categorical!
+combine
+completecases
+deleterows!
+describe
+dropnull
+dropnull!
+eachcol
+eachrow
+eltypes
+head
+names
+names!
+nonunique
+nullable!
+order
+rename!
+rename
+show
+showcols
+size
+sort
+sort!
+tail
+unique!
+```
diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md
deleted file mode 100644
index 1f9f578d25..0000000000
--- a/docs/src/lib/manipulation.md
+++ /dev/null
@@ -1,25 +0,0 @@
-```@meta
-CurrentModule = DataFrames
-```
-
-# Data Manipulation
-
-```@index
-Pages = ["manipulation.md"]
-```
-
-## Joins
-
-```@docs
-join
-```
-
-## Reshaping
-
-```@docs
-melt
-stack
-unstack
-stackdf
-meltdf
-```
diff --git a/docs/src/lib/maintypes.md b/docs/src/lib/types.md
similarity index 60%
rename from docs/src/lib/maintypes.md
rename to docs/src/lib/types.md
index ccc62d530c..a9e1ac3760 100644
--- a/docs/src/lib/maintypes.md
+++ b/docs/src/lib/types.md
@@ -3,14 +3,17 @@
CurrentModule = DataFrames
```
-# Main Types
+# Types
```@index
-Pages = ["maintypes.md"]
+Pages = ["types.md"]
```
```@docs
AbstractDataFrame
DataFrame
+DataFrameRow
+GroupApplied
+GroupedDataFrame
SubDataFrame
```
diff --git a/docs/src/lib/utilities.md b/docs/src/lib/utilities.md
deleted file mode 100644
index d0439fab90..0000000000
--- a/docs/src/lib/utilities.md
+++ /dev/null
@@ -1,26 +0,0 @@
-```@meta
-CurrentModule = DataFrames
-```
-
-# Utilities
-
-```@index
-Pages = ["utilities.md"]
-```
-
-```@docs
-eltypes
-head
-completecases
-describe
-dropnull
-dropnull!
-dump
-names!
-nonunique
-rename
-rename!
-tail
-unique
-unique!
-```
diff --git a/docs/src/man/categorical.md b/docs/src/man/categorical.md
index ac41fa800e..85d9774ca4 100644
--- a/docs/src/man/categorical.md
+++ b/docs/src/man/categorical.md
@@ -2,52 +2,151 @@
Often, we have to deal with factors that take on a small number of levels:
-```julia
-v = ["Group A", "Group A", "Group A",
- "Group B", "Group B", "Group B"]
+```jldoctest categorical
+julia> v = ["Group A", "Group A", "Group A", "Group B", "Group B", "Group B"]
+6-element Array{String,1}:
+ "Group A"
+ "Group A"
+ "Group A"
+ "Group B"
+ "Group B"
+ "Group B"
+
```
The naive encoding used in an `Array` represents every entry of this vector as a full string. In contrast, we can represent the data more efficiently by replacing the strings with indices into a small pool of levels. This is what the `CategoricalArray` type does:
-```julia
-cv = CategoricalArray(["Group A", "Group A", "Group A",
- "Group B", "Group B", "Group B"])
+```jldoctest categorical
+julia> using CategoricalArrays
+
+julia> cv = CategoricalArray(v)
+6-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
+ "Group A"
+ "Group A"
+ "Group A"
+ "Group B"
+ "Group B"
+ "Group B"
+
```
`CategoricalArrays` support missing values via the `Nulls` package.
-```julia
-using Nulls
-cv = CategoricalArray(["Group A", null, "Group A",
- "Group B", "Group B", null])
+```jldoctest categorical
+julia> using Nulls
+
+julia> cv = CategoricalArray(["Group A", null, "Group A",
+ "Group B", "Group B", null])
+6-element CategoricalArrays.CategoricalArray{Union{Nulls.Null, String},1,UInt32}:
+ "Group A"
+ null
+ "Group A"
+ "Group B"
+ "Group B"
+ null
```
In addition to representing repeated data efficiently, the `CategoricalArray` type allows us to determine efficiently the allowed levels of the variable at any time using the `levels` function (note that levels may or may not be actually used in the data):
-```julia
-levels(cv)
+```jldoctest categorical
+julia> levels(cv)
+2-element Array{String,1}:
+ "Group A"
+ "Group B"
+
```
The `levels!` function also allows changing the order of appearance of the levels, which can be useful for display purposes or when working with ordered variables.
-By default, a `CategoricalArray` is able to represent 232differents levels. You can use less memory by calling the `compact` function:
+```jldoctest categorical
+julia> levels!(cv, ["Group B", "Group A"]);
+
+julia> levels(cv)
+2-element Array{String,1}:
+ "Group B"
+ "Group A"
+
+julia> sort(cv)
+6-element CategoricalArrays.CategoricalArray{Union{Nulls.Null, String},1,UInt32}:
+ "Group B"
+ "Group B"
+ "Group A"
+ "Group A"
+ null
+ null
-```julia
-cv = compact(cv)
```
-Often, you will have factors encoded inside a DataFrame with `Array` columns instead of `CategoricalArray` columns. You can do conversion of a single column using the `categorical` function:
+By default, a `CategoricalArray` is able to represent ``2^{32}`` different levels. You can use less memory by calling the `compress` function:
+
+```jldoctest categorical
+julia> cv = compress(cv)
+6-element CategoricalArrays.CategoricalArray{Union{Nulls.Null, String},1,UInt8}:
+ "Group A"
+ null
+ "Group A"
+ "Group B"
+ "Group B"
+ null
-```julia
-cv = categorical(v)
```
-Or you can edit the columns of a `DataFrame` in-place using the `categorical!` function:
+Often, you will have factors encoded inside a DataFrame with `Array` columns instead of
+`CategoricalArray` columns. You can convert one or more columns of the DataFrame using the
+`categorical!` function, which modifies the input DataFrame in-place.
+
+```jldoctest categorical
+julia> using DataFrames
+
+julia> df = DataFrame(A = ["A", "B", "C", "D", "D", "A"],
+ B = ["X", "X", "X", "Y", "Y", "Y"])
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ A │ X │
+│ 2 │ B │ X │
+│ 3 │ C │ X │
+│ 4 │ D │ Y │
+│ 5 │ D │ Y │
+│ 6 │ A │ Y │
+
+julia> eltypes(df)
+2-element Array{Type,1}:
+ String
+ String
+
+julia> categorical!(df, :A) # change the column `:A` to be categorical
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ A │ X │
+│ 2 │ B │ X │
+│ 3 │ C │ X │
+│ 4 │ D │ Y │
+│ 5 │ D │ Y │
+│ 6 │ A │ Y │
+
+julia> eltypes(df)
+2-element Array{Type,1}:
+ CategoricalArrays.CategoricalString{UInt32}
+ String
+
+julia> categorical!(df) # change all columns to be categorical
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ A │ X │
+│ 2 │ B │ X │
+│ 3 │ C │ X │
+│ 4 │ D │ Y │
+│ 5 │ D │ Y │
+│ 6 │ A │ Y │
+
+julia> eltypes(df)
+2-element Array{Type,1}:
+ CategoricalArrays.CategoricalString{UInt32}
+ CategoricalArrays.CategoricalString{UInt32}
-```julia
-df = DataFrame(A = [1, 1, 1, 2, 2, 2],
- B = ["X", "X", "X", "Y", "Y", "Y"])
-categorical!(df, [:A, :B])
```
Using categorical arrays is important for working with the [GLM package](https://github.com/JuliaStats/GLM.jl). When fitting regression models, `CategoricalArray` columns in the input are translated into 0/1 indicator columns in the `ModelMatrix` with one column for each of the levels of the `CategoricalArray`. This allows one to analyze categorical data efficiently.
diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index 10b0491917..63dc7cad69 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -13,7 +13,7 @@ Throughout the rest of this tutorial, we will assume that you have installed the
To get started, let's examine the `Null` type. `Null` is a type implemented by the [Nulls.jl](https://github.com/JuliaData/Nulls.jl) package to represent missing data. `null` is an instance of the type `Null` used to represent a missing value.
-```julia
+```jldoctest nulls
julia> using DataFrames
julia> null
@@ -26,7 +26,7 @@ Nulls.Null
The `Null` type lets users create `Vector`s and `DataFrame` columns with missing values. Here we create a vector with a null value and the element-type of the returned vector is `Union{Nulls.Null, Int64}`.
-```julia
+```jldoctest nulls
julia> x = [1, 2, null]
3-element Array{Union{Nulls.Null, Int64},1}:
1
@@ -46,15 +46,15 @@ true
`null` values can be excluded when performing operations by using `Nulls.skip`, which returns a memory-efficient iterator.
-```julia
+```jldoctest nulls
julia> Nulls.skip(x)
-Base.Generator{Base.Iterators.Filter{Nulls.##4#6{Nulls.Null},Array{Union{Nulls.Null, Int64},1}},Nulls.##3#5}(Nulls.#3, Base.Iterators.Filter{Nulls.##4#6{Nulls.Null},Array{Union{Nulls.Null, Int64},1}}(Nulls.#4, Union{Nulls.Null, Int64}[1, 2, null]))
+Base.Generator{Base.Iterators.Filter{Nulls.##4#6,Array{Union{Int64, Nulls.Null},1}},Nulls.##3#5}(Nulls.#3, Base.Iterators.Filter{Nulls.##4#6,Array{Union{Int64, Nulls.Null},1}}(Nulls.#4, Union{Int64, Nulls.Null}[1, 2, null]))
```
The output of `Nulls.skip` can be passed directly into functions as an argument. For example, we can find the `sum` of all non-null values or `collect` the non-null values into a new null-free vector.
-```julia
+```jldoctest nulls
julia> sum(Nulls.skip(x))
3
@@ -67,7 +67,7 @@ julia> collect(Nulls.skip(x))
`null` elements can be replaced with other values via `Nulls.replace`.
-```julia
+```jldoctest nulls
julia> collect(Nulls.replace(x, 1))
3-element Array{Int64,1}:
1
@@ -78,7 +78,10 @@ julia> collect(Nulls.replace(x, 1))
The function `Nulls.T` returns the element-type `T` in `Union{T, Null}`.
-```julia
+```jldoctest nulls
+julia> eltype(x)
+Union{Int64, Nulls.Null}
+
julia> Nulls.T(eltype(x))
Int64
@@ -86,7 +89,7 @@ Int64
Use `nulls` to generate nullable `Vector`s and `Array`s, using the optional first argument to specify the element-type.
-```julia
+```jldoctest nulls
julia> nulls(1)
1-element Array{Nulls.Null,1}:
null
@@ -111,61 +114,185 @@ julia> nulls(Int, 1, 3)
The `DataFrame` type can be used to represent data tables, each column of which is a vector. You can specify the columns using keyword arguments or pairs:
-```julia
-df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"])
-df = DataFrame(:A => 1:4, :B => ["M", "F", "F", "M"])
+```jldoctest dataframe
+julia> using DataFrames
+
+julia> DataFrame(A = 1:4, B = ["M", "F", "F", "M"])
+4×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ 1 │ M │
+│ 2 │ 2 │ F │
+│ 3 │ 3 │ F │
+│ 4 │ 4 │ M │
+
```
It is also possible to construct a `DataFrame` in stages:
-```julia
-df = DataFrame()
-df[:A] = 1:8
-df[:B] = ["M", "F", "F", "M", "F", "M", "M", "F"]
-df
+```jldoctest dataframe
+julia> df = DataFrame()
+0×0 DataFrames.DataFrame
+
+
+julia> df[:A] = 1:8
+1:8
+
+julia> df[:B] = ["M", "F", "F", "M", "F", "M", "M", "F"]
+8-element Array{String,1}:
+ "M"
+ "F"
+ "F"
+ "M"
+ "F"
+ "M"
+ "M"
+ "F"
+
+julia> df
+8×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ 1 │ M │
+│ 2 │ 2 │ F │
+│ 3 │ 3 │ F │
+│ 4 │ 4 │ M │
+│ 5 │ 5 │ F │
+│ 6 │ 6 │ M │
+│ 7 │ 7 │ M │
+│ 8 │ 8 │ F │
+
```
-The `DataFrame` we build in this way has 8 rows and 2 columns. You can check this using `size` function:
+The `DataFrame` we build in this way has 8 rows and 2 columns. You can check this using the
+`size` function:
+
+```jldoctest dataframe
+julia> size(df, 1) == 8
+true
+
+julia> size(df, 2) == 2
+true
+
+julia> size(df) == (8, 2)
+true
-```julia
-nrows = size(df, 1)
-ncols = size(df, 2)
```
We can also look at small subsets of the data in a couple of different ways:
-```julia
-head(df)
-tail(df)
+```jldoctest dataframe
+julia> head(df)
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ 1 │ M │
+│ 2 │ 2 │ F │
+│ 3 │ 3 │ F │
+│ 4 │ 4 │ M │
+│ 5 │ 5 │ F │
+│ 6 │ 6 │ M │
+
+julia> tail(df)
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ 3 │ F │
+│ 2 │ 4 │ M │
+│ 3 │ 5 │ F │
+│ 4 │ 6 │ M │
+│ 5 │ 7 │ M │
+│ 6 │ 8 │ F │
+
+julia> df[1:3, :]
+3×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1 │ 1 │ M │
+│ 2 │ 2 │ F │
+│ 3 │ 3 │ F │
-df[1:3, :]
```
Having seen what some of the rows look like, we can try to summarize the entire data set using `describe`:
-```julia
-describe(df)
+```jldoctest dataframe
+julia> describe(df)
+A
+Summary Stats:
+Mean: 4.500000
+Minimum: 1.000000
+1st Quartile: 2.750000
+Median: 4.500000
+3rd Quartile: 6.250000
+Maximum: 8.000000
+Length: 8
+Type: Int64
+
+B
+Summary Stats:
+Length: 8
+Type: String
+Number Unique: 2
+
+
```
-To focus our search, we start looking at just the means and medians of specific columns. In the example below, we use numeric indexing to access the columns of the `DataFrame`:
+To access individual columns of the dataset, refer to them either by name (as a symbol)
+or by numerical index. Here we extract the first column, `:A`, and use it to compute
+the mean and variance.
+
+```jldoctest dataframe
+julia> mean(df[:A]) == mean(df[1]) == 4.5
+true
+
+julia> var(df[:A]) == var(df[1]) == 6.0
+true
-```julia
-mean(Nulls.skip(df[1]))
-median(Nulls.skip(df[1]))
```
-We could also have used column names to access individual columns:
+If your dataset has missing values, most functions will require you to remove them
+beforehand. Here we will replace all odd-numbered rows in the first column with missing data
+to show how to handle the above example when missing values are present in your dataset.
+
+```jldoctest dataframe
+julia> df[:A] = [isodd(i) ? null : value for (i, value) in enumerate(df[:A])];
+
+julia> df
+8×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼──────┼───┤
+│ 1 │ null │ M │
+│ 2 │ 2 │ F │
+│ 3 │ null │ F │
+│ 4 │ 4 │ M │
+│ 5 │ null │ F │
+│ 6 │ 6 │ M │
+│ 7 │ null │ M │
+│ 8 │ 8 │ F │
+
+julia> mean(Nulls.skip(df[:A]))
+5.0
-```julia
-mean(Nulls.skip(df[:A]))
-median(Nulls.skip(df[:A]))
```
We can also apply a function to each column of a `DataFrame` with the `colwise` function. For example:
-```julia
-df = DataFrame(A = 1:4, B = randn(4))
-colwise(c->cumsum(Nulls.skip(c)), df)
+```jldoctest dataframe
+julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
+4×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼─────┤
+│ 1 │ 1 │ 4.0 │
+│ 2 │ 2 │ 3.0 │
+│ 3 │ 3 │ 2.0 │
+│ 4 │ 4 │ 1.0 │
+
+julia> colwise(sum, df)
+2-element Array{Real,1}:
+ 10
+ 10.0
+
```
## Importing and Exporting Data (I/O)
@@ -199,10 +326,22 @@ The behavior of CSV functions can be adapted via keyword arguments. For more inf
To see more of the functionality for working with `DataFrame` objects, we need a more complex data set to work with. We can access Fisher's iris data set using the following functions:
-```julia
-using CSV
-iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"))
-head(iris)
+```jldoctest csv
+julia> using DataFrames, CSV
+
+julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"));
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │
+│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │
+│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │
+│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │
+│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │
+│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │
+
```
## Querying DataFrames
diff --git a/docs/src/man/joins.md b/docs/src/man/joins.md
index 94c7e8452e..aca3848b89 100644
--- a/docs/src/man/joins.md
+++ b/docs/src/man/joins.md
@@ -2,23 +2,36 @@
We often need to combine two or more data sets together to provide a complete picture of the topic we are studying. For example, suppose that we have the following two data sets:
-```julia
-names = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"])
-jobs = DataFrame(ID = [20, 40], Job = ["Lawyer", "Doctor"])
+```jldoctest joins
+julia> using DataFrames
+
+julia> names = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"])
+2×2 DataFrames.DataFrame
+│ Row │ ID │ Name │
+├─────┼────┼──────────┤
+│ 1 │ 20 │ John Doe │
+│ 2 │ 40 │ Jane Doe │
+
+julia> jobs = DataFrame(ID = [20, 40], Job = ["Lawyer", "Doctor"])
+2×2 DataFrames.DataFrame
+│ Row │ ID │ Job │
+├─────┼────┼────────┤
+│ 1 │ 20 │ Lawyer │
+│ 2 │ 40 │ Doctor │
+
```
We might want to work with a larger data set that contains both the names and jobs for each ID. We can do this using the `join` function:
-```julia
-full = join(names, jobs, on = :ID)
-```
-
-Output:
+```jldoctest joins
+julia> join(names, jobs, on = :ID)
+2×3 DataFrames.DataFrame
+│ Row │ ID │ Name │ Job │
+├─────┼────┼──────────┼────────┤
+│ 1 │ 20 │ John Doe │ Lawyer │
+│ 2 │ 40 │ Jane Doe │ Doctor │
-| Row | ID | Name | Job |
-|-----|----|------------|----------|
-| 1 | 20 | "John Doe" | "Lawyer" |
-| 2 | 40 | "Jane Doe" | "Doctor" |
+```
In relational database theory, this operation is generally referred to as a join. The columns used to determine which rows should be combined during a join are called keys.
@@ -32,43 +45,156 @@ There are seven kinds of joins supported by the DataFrames package:
- Anti: The output contains rows for values of the key that exist in the first (left) but not the second (right) argument to `join`. As with semi joins, output is restricted to columns from the first (left) argument.
- Cross: The output is the cartesian product of rows from the first (left) and second (right) arguments to `join`.
+See [the Wikipedia page on SQL joins](https://en.wikipedia.org/wiki/Join_(SQL)) for more information.
+
You can control the kind of join that `join` performs using the `kind` keyword argument:
-```julia
-a = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"])
-b = DataFrame(ID = [20, 60], Job = ["Lawyer", "Astronaut"])
-join(a, b, on = :ID, kind = :inner)
-join(a, b, on = :ID, kind = :left)
-join(a, b, on = :ID, kind = :right)
-join(a, b, on = :ID, kind = :outer)
-join(a, b, on = :ID, kind = :semi)
-join(a, b, on = :ID, kind = :anti)
+```jldoctest joins
+julia> jobs = DataFrame(ID = [20, 60], Job = ["Lawyer", "Astronaut"])
+2×2 DataFrames.DataFrame
+│ Row │ ID │ Job │
+├─────┼────┼───────────┤
+│ 1 │ 20 │ Lawyer │
+│ 2 │ 60 │ Astronaut │
+
+julia> join(names, jobs, on = :ID, kind = :inner)
+1×3 DataFrames.DataFrame
+│ Row │ ID │ Name │ Job │
+├─────┼────┼──────────┼────────┤
+│ 1 │ 20 │ John Doe │ Lawyer │
+
+julia> join(names, jobs, on = :ID, kind = :left)
+2×3 DataFrames.DataFrame
+│ Row │ ID │ Name │ Job │
+├─────┼────┼──────────┼────────┤
+│ 1 │ 20 │ John Doe │ Lawyer │
+│ 2 │ 40 │ Jane Doe │ null │
+
+julia> join(names, jobs, on = :ID, kind = :right)
+2×3 DataFrames.DataFrame
+│ Row │ ID │ Name │ Job │
+├─────┼────┼──────────┼───────────┤
+│ 1 │ 20 │ John Doe │ Lawyer │
+│ 2 │ 60 │ null │ Astronaut │
+
+julia> join(names, jobs, on = :ID, kind = :outer)
+3×3 DataFrames.DataFrame
+│ Row │ ID │ Name │ Job │
+├─────┼────┼──────────┼───────────┤
+│ 1 │ 20 │ John Doe │ Lawyer │
+│ 2 │ 40 │ Jane Doe │ null │
+│ 3 │ 60 │ null │ Astronaut │
+
+julia> join(names, jobs, on = :ID, kind = :semi)
+1×2 DataFrames.DataFrame
+│ Row │ ID │ Name │
+├─────┼────┼──────────┤
+│ 1 │ 20 │ John Doe │
+
+julia> join(names, jobs, on = :ID, kind = :anti)
+1×2 DataFrames.DataFrame
+│ Row │ ID │ Name │
+├─────┼────┼──────────┤
+│ 1 │ 40 │ Jane Doe │
+
```
Cross joins are the only kind of join that does not use a key:
-```julia
-join(a, b, kind = :cross)
+```jldoctest joins
+julia> join(names, jobs, kind = :cross)
+4×4 DataFrames.DataFrame
+│ Row │ ID │ Name │ ID_1 │ Job │
+├─────┼────┼──────────┼──────┼───────────┤
+│ 1 │ 20 │ John Doe │ 20 │ Lawyer │
+│ 2 │ 20 │ John Doe │ 60 │ Astronaut │
+│ 3 │ 40 │ Jane Doe │ 20 │ Lawyer │
+│ 4 │ 40 │ Jane Doe │ 60 │ Astronaut │
+
```
In order to join data tables on keys which have different names, you must first rename them so that they match. This can be done using rename!:
-```julia
-a = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"])
-b = DataFrame(IDNew = [20, 40], Job = ["Lawyer", "Doctor"])
-rename!(b, :IDNew, :ID)
-join(a, b, on = :ID, kind = :inner)
+```jldoctest joins
+julia> a = DataFrame(ID = [20, 40], Name = ["John Doe", "Jane Doe"])
+2×2 DataFrames.DataFrame
+│ Row │ ID │ Name │
+├─────┼────┼──────────┤
+│ 1 │ 20 │ John Doe │
+│ 2 │ 40 │ Jane Doe │
+
+julia> b = DataFrame(IDNew = [20, 40], Job = ["Lawyer", "Doctor"])
+2×2 DataFrames.DataFrame
+│ Row │ IDNew │ Job │
+├─────┼───────┼────────┤
+│ 1 │ 20 │ Lawyer │
+│ 2 │ 40 │ Doctor │
+
+julia> rename!(b, :IDNew, :ID)
+2×2 DataFrames.DataFrame
+│ Row │ ID │ Job │
+├─────┼────┼────────┤
+│ 1 │ 20 │ Lawyer │
+│ 2 │ 40 │ Doctor │
+
+julia> join(a, b, on = :ID, kind = :inner)
+2×3 DataFrames.DataFrame
+│ Row │ ID │ Name │ Job │
+├─────┼────┼──────────┼────────┤
+│ 1 │ 20 │ John Doe │ Lawyer │
+│ 2 │ 40 │ Jane Doe │ Doctor │
+
```
Or renaming multiple columns at a time:
-```julia
-a = DataFrame(City = ["Amsterdam", "London", "London", "New York", "New York"],
- Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
- Category = [1, 2, 3, 4, 5])
-b = DataFrame(Location = ["Amsterdam", "London", "London", "New York", "New York"],
- Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
- Name = ["a", "b", "c", "d", "e"])
-rename!(b, [:Location => :City, :Work => :Job])
-join(a, b, on = [:City, :Job])
+```jldoctest joins
+julia> a = DataFrame(City = ["Amsterdam", "London", "London", "New York", "New York"],
+ Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
+ Category = [1, 2, 3, 4, 5])
+5×3 DataFrames.DataFrame
+│ Row │ City │ Job │ Category │
+├─────┼───────────┼────────┼──────────┤
+│ 1 │ Amsterdam │ Lawyer │ 1 │
+│ 2 │ London │ Lawyer │ 2 │
+│ 3 │ London │ Lawyer │ 3 │
+│ 4 │ New York │ Doctor │ 4 │
+│ 5 │ New York │ Doctor │ 5 │
+
+julia> b = DataFrame(Location = ["Amsterdam", "London", "London", "New York", "New York"],
+ Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
+ Name = ["a", "b", "c", "d", "e"])
+5×3 DataFrames.DataFrame
+│ Row │ Location │ Work │ Name │
+├─────┼───────────┼────────┼──────┤
+│ 1 │ Amsterdam │ Lawyer │ a │
+│ 2 │ London │ Lawyer │ b │
+│ 3 │ London │ Lawyer │ c │
+│ 4 │ New York │ Doctor │ d │
+│ 5 │ New York │ Doctor │ e │
+
+julia> rename!(b, [:Location => :City, :Work => :Job])
+5×3 DataFrames.DataFrame
+│ Row │ City │ Job │ Name │
+├─────┼───────────┼────────┼──────┤
+│ 1 │ Amsterdam │ Lawyer │ a │
+│ 2 │ London │ Lawyer │ b │
+│ 3 │ London │ Lawyer │ c │
+│ 4 │ New York │ Doctor │ d │
+│ 5 │ New York │ Doctor │ e │
+
+julia> join(a, b, on = [:City, :Job])
+9×4 DataFrames.DataFrame
+│ Row │ City │ Job │ Category │ Name │
+├─────┼───────────┼────────┼──────────┼──────┤
+│ 1 │ Amsterdam │ Lawyer │ 1 │ a │
+│ 2 │ London │ Lawyer │ 2 │ b │
+│ 3 │ London │ Lawyer │ 2 │ c │
+│ 4 │ London │ Lawyer │ 3 │ b │
+│ 5 │ London │ Lawyer │ 3 │ c │
+│ 6 │ New York │ Doctor │ 4 │ d │
+│ 7 │ New York │ Doctor │ 4 │ e │
+│ 8 │ New York │ Doctor │ 5 │ d │
+│ 9 │ New York │ Doctor │ 5 │ e │
+
```
diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md
index 398d03a373..d883281011 100644
--- a/docs/src/man/querying_frameworks.md
+++ b/docs/src/man/querying_frameworks.md
@@ -12,59 +12,77 @@ A query is started with the `@from` macro and consists of a series of query comm
A simple example of a query looks like this:
-```@setup 1
-using DataFrames, Query
-```
-
-```@example 1
-using DataFrames, Query
+```jldoctest query
+julia> using DataFrames, Query
+
+julia> df = DataFrame(name=["John", "Sally", "Roger"], age=[54., 34., 79.], children=[0, 2, 4])
+3×3 DataFrames.DataFrame
+│ Row │ name │ age │ children │
+├─────┼───────┼──────┼──────────┤
+│ 1 │ John │ 54.0 │ 0 │
+│ 2 │ Sally │ 34.0 │ 2 │
+│ 3 │ Roger │ 79.0 │ 4 │
+
+julia> q1 = @from i in df begin
+ @where i.age > 40
+ @select {number_of_children=i.children, i.name}
+ @collect DataFrame
+ end
+2×2 DataFrames.DataFrame
+│ Row │ number_of_children │ name │
+├─────┼────────────────────┼───────┤
+│ 1 │ 0 │ John │
+│ 2 │ 4 │ Roger │
-df = DataFrame(name=["John", "Sally", "Roger"], age=[54., 34., 79.], children=[0, 2, 4])
-
-q1 = @from i in df begin
- @where i.age > 40
- @select {number_of_children=i.children, i.name}
- @collect DataFrame
-end
```
The query starts with the `@from` macro. The first argument `i` is the name of the range variable that will be used to refer to an individual row in later query commands. The next argument `df` is the data source that one wants to query. The `@where` command in this query will filter the source data by applying the filter condition `i.age > 40`. This filters out any rows in which the `age` column is not larger than 40. The `@select` command then projects the columns of the source data onto a new column structure. The example here applies three specific modifications: 1) it only keeps a subset of the columns in the source `DataFrame`, i.e. the `age` column will not be part of the transformed data; 2) it changes the order of the two columns that are selected; and 3) it renames one of the columns that is selected from `children` to `number_of_children`. The example query uses the `{}` syntax to achieve this. A `{}` in a Query.jl expression instantiates a new [NamedTuple](https://github.com/blackrock/NamedTuples.jl), i.e. it is a shortcut for writing `@NT(number_of_children=>i.children, name=>i.name)`. The `@collect` statement determines the data structure that the query returns. In this example the results are returned as a `DataFrame`.
A query without a `@collect` statement returns a standard julia iterator that can be used with any normal julia language construct that can deal with iterators. The following code returns a julia iterator for the query results:
-```@example 1
-q2 = @from i in df begin
- @where i.age > 40
- @select {number_of_children=i.children, i.name}
-end
-nothing # hide
+```jldoctest query
+julia> q2 = @from i in df begin
+ @where i.age > 40
+ @select {number_of_children=i.children, i.name}
+ end; # suppress printing the iterator type
+
```
One can loop over the results using a standard julia `for` statement:
-```@example 1
-total_children = 0
-for i in q2
- total_children += i.number_of_children
-end
+```jldoctest query
+julia> total_children = 0
+0
+
+julia> for i in q2
+ total_children += i.number_of_children
+ end
+
+julia> total_children
+4
-println("Total number of children: $(get(total_children))")
```
Or one can use a comprehension to extract the name of a subset of rows:
-```@example 1
-y = [i.name for i in q2 if i.number_of_children > 0]
+```jldoctest query
+julia> y = [i.name for i in q2 if i.number_of_children > 0]
+1-element Array{String,1}:
+ "Roger"
+
```
The last example (extracting only the name and applying a second filter) could of course be completely expressed as a query expression:
-```@example 1
-q3 = @from i in df begin
- @where i.age > 40 && i.children > 0
- @select i.name
- @collect
-end
+```jldoctest query
+julia> q3 = @from i in df begin
+ @where i.age > 40 && i.children > 0
+ @select i.name
+ @collect
+ end
+1-element Array{String,1}:
+ "Roger"
+
```
A query that ends with a `@collect` statement without a specific type will materialize the query results into an array. Note also the difference in the `@select` statement: The previous queries all used the `{}` syntax in the `@select` statement to project results into a tabular format. The last query instead just selects a single value from each row in the `@select` statement.
diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md
index a532d35b8d..a94c7370d8 100644
--- a/docs/src/man/reshaping_and_pivoting.md
+++ b/docs/src/man/reshaping_and_pivoting.md
@@ -2,18 +2,87 @@
Reshape data from wide to long format using the `stack` function:
-```julia
-using DataFrames
-using CSV
-iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), DataFrame)
-iris[:id] = 1:size(iris, 1) # this makes it easier to unstack
-d = stack(iris, 1:4)
+```jldoctest reshape
+julia> using DataFrames, CSV
+
+julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"));
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │
+│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │
+│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │
+│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │
+│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │
+│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ virginica │
+│ 2 │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ virginica │
+│ 3 │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ virginica │
+│ 4 │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ virginica │
+│ 5 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ virginica │
+│ 6 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ virginica │
+
+
+julia> d = stack(iris, 1:4);
+
+julia> head(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼─────────────┼───────┼─────────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │
+│ 2 │ SepalLength │ 4.9 │ setosa │
+│ 3 │ SepalLength │ 4.7 │ setosa │
+│ 4 │ SepalLength │ 4.6 │ setosa │
+│ 5 │ SepalLength │ 5.0 │ setosa │
+│ 6 │ SepalLength │ 5.4 │ setosa │
+
+julia> tail(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼────────────┼───────┼───────────┤
+│ 1 │ PetalWidth │ 2.5 │ virginica │
+│ 2 │ PetalWidth │ 2.3 │ virginica │
+│ 3 │ PetalWidth │ 1.9 │ virginica │
+│ 4 │ PetalWidth │ 2.0 │ virginica │
+│ 5 │ PetalWidth │ 2.3 │ virginica │
+│ 6 │ PetalWidth │ 1.8 │ virginica │
+
```
The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given:
-```julia
-d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth])
+```jldoctest reshape
+julia> d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]);
+
+julia> head(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼─────────────┼───────┼─────────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │
+│ 2 │ SepalLength │ 4.9 │ setosa │
+│ 3 │ SepalLength │ 4.7 │ setosa │
+│ 4 │ SepalLength │ 4.6 │ setosa │
+│ 5 │ SepalLength │ 5.0 │ setosa │
+│ 6 │ SepalLength │ 5.4 │ setosa │
+
+julia> tail(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼────────────┼───────┼───────────┤
+│ 1 │ PetalWidth │ 2.5 │ virginica │
+│ 2 │ PetalWidth │ 2.3 │ virginica │
+│ 3 │ PetalWidth │ 1.9 │ virginica │
+│ 4 │ PetalWidth │ 2.0 │ virginica │
+│ 5 │ PetalWidth │ 2.3 │ virginica │
+│ 6 │ PetalWidth │ 1.8 │ virginica │
+
```
Note that all columns can be of different types. Type promotion follows the rules of `vcat`.
@@ -22,41 +91,176 @@ The stacked DataFrame that results includes all of the columns not specified to
A third optional argument to `stack` represents the id columns that are repeated. This makes it easier to specify which variables you want included in the long format:
-```julia
-d = stack(iris, [:SepalLength, :SepalWidth], :Species)
+```jldoctest reshape
+julia> d = stack(iris, [:SepalLength, :SepalWidth], :Species);
+
+julia> head(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼─────────────┼───────┼─────────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │
+│ 2 │ SepalLength │ 4.9 │ setosa │
+│ 3 │ SepalLength │ 4.7 │ setosa │
+│ 4 │ SepalLength │ 4.6 │ setosa │
+│ 5 │ SepalLength │ 5.0 │ setosa │
+│ 6 │ SepalLength │ 5.4 │ setosa │
+
+julia> tail(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼────────────┼───────┼───────────┤
+│ 1 │ SepalWidth │ 3.3 │ virginica │
+│ 2 │ SepalWidth │ 3.0 │ virginica │
+│ 3 │ SepalWidth │ 2.5 │ virginica │
+│ 4 │ SepalWidth │ 3.0 │ virginica │
+│ 5 │ SepalWidth │ 3.4 │ virginica │
+│ 6 │ SepalWidth │ 3.0 │ virginica │
+
```
`melt` is an alternative function to reshape from wide to long format. It is based on `stack`, but it prefers specification of the id columns as:
-```julia
-d = melt(iris, :Species)
-```
+```jldoctest reshape
+julia> d = melt(iris, :Species);
-All other columns are assumed to be measured variables (they are stacked).
+julia> head(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼─────────────┼───────┼─────────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │
+│ 2 │ SepalLength │ 4.9 │ setosa │
+│ 3 │ SepalLength │ 4.7 │ setosa │
+│ 4 │ SepalLength │ 4.6 │ setosa │
+│ 5 │ SepalLength │ 5.0 │ setosa │
+│ 6 │ SepalLength │ 5.4 │ setosa │
-You can also stack an entire DataFrame. The default stacks all floating-point columns:
+julia> tail(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼────────────┼───────┼───────────┤
+│ 1 │ PetalWidth │ 2.5 │ virginica │
+│ 2 │ PetalWidth │ 2.3 │ virginica │
+│ 3 │ PetalWidth │ 1.9 │ virginica │
+│ 4 │ PetalWidth │ 2.0 │ virginica │
+│ 5 │ PetalWidth │ 2.3 │ virginica │
+│ 6 │ PetalWidth │ 1.8 │ virginica │
-```julia
-d = stack(iris)
```
`unstack` converts from a long format to a wide format. By default, it requires specifying which columns hold the id variable, the column variable names, and the column values:
-```julia
-longdf = melt(iris, [:Species, :id])
-widedf = unstack(longdf, :id, :variable, :value)
+```jldoctest reshape
+julia> iris[:id] = 1:size(iris, 1)
+1:150
+
+julia> longdf = melt(iris, [:Species, :id]);
+
+julia> head(longdf)
+6×4 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │ id │
+├─────┼─────────────┼───────┼─────────┼────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │ 1 │
+│ 2 │ SepalLength │ 4.9 │ setosa │ 2 │
+│ 3 │ SepalLength │ 4.7 │ setosa │ 3 │
+│ 4 │ SepalLength │ 4.6 │ setosa │ 4 │
+│ 5 │ SepalLength │ 5.0 │ setosa │ 5 │
+│ 6 │ SepalLength │ 5.4 │ setosa │ 6 │
+
+julia> tail(longdf)
+6×4 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │ id │
+├─────┼────────────┼───────┼───────────┼─────┤
+│ 1 │ PetalWidth │ 2.5 │ virginica │ 145 │
+│ 2 │ PetalWidth │ 2.3 │ virginica │ 146 │
+│ 3 │ PetalWidth │ 1.9 │ virginica │ 147 │
+│ 4 │ PetalWidth │ 2.0 │ virginica │ 148 │
+│ 5 │ PetalWidth │ 2.3 │ virginica │ 149 │
+│ 6 │ PetalWidth │ 1.8 │ virginica │ 150 │
+
+julia> widedf = unstack(longdf, :id, :variable, :value);
+
+julia> head(widedf)
+6×5 DataFrames.DataFrame
+│ Row │ id │ PetalLength │ PetalWidth │ SepalLength │ SepalWidth │
+├─────┼────┼─────────────┼────────────┼─────────────┼────────────┤
+│ 1 │ 1 │ 1.4 │ 0.2 │ 5.1 │ 3.5 │
+│ 2 │ 2 │ 1.4 │ 0.2 │ 4.9 │ 3.0 │
+│ 3 │ 3 │ 1.3 │ 0.2 │ 4.7 │ 3.2 │
+│ 4 │ 4 │ 1.5 │ 0.2 │ 4.6 │ 3.1 │
+│ 5 │ 5 │ 1.4 │ 0.2 │ 5.0 │ 3.6 │
+│ 6 │ 6 │ 1.7 │ 0.4 │ 5.4 │ 3.9 │
+
+julia> tail(widedf)
+6×5 DataFrames.DataFrame
+│ Row │ id │ PetalLength │ PetalWidth │ SepalLength │ SepalWidth │
+├─────┼─────┼─────────────┼────────────┼─────────────┼────────────┤
+│ 1 │ 145 │ 5.7 │ 2.5 │ 6.7 │ 3.3 │
+│ 2 │ 146 │ 5.2 │ 2.3 │ 6.7 │ 3.0 │
+│ 3 │ 147 │ 5.0 │ 1.9 │ 6.3 │ 2.5 │
+│ 4 │ 148 │ 5.2 │ 2.0 │ 6.5 │ 3.0 │
+│ 5 │ 149 │ 5.4 │ 2.3 │ 6.2 │ 3.4 │
+│ 6 │ 150 │ 5.1 │ 1.8 │ 5.9 │ 3.0 │
+
```
If the remaining columns are unique, you can skip the id variable and use:
-```julia
-widedf = unstack(longdf, :variable, :value)
+```jldoctest reshape
+julia> longdf = melt(iris, [:Species, :id]);
+
+julia> head(longdf)
+6×4 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │ id │
+├─────┼─────────────┼───────┼─────────┼────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │ 1 │
+│ 2 │ SepalLength │ 4.9 │ setosa │ 2 │
+│ 3 │ SepalLength │ 4.7 │ setosa │ 3 │
+│ 4 │ SepalLength │ 4.6 │ setosa │ 4 │
+│ 5 │ SepalLength │ 5.0 │ setosa │ 5 │
+│ 6 │ SepalLength │ 5.4 │ setosa │ 6 │
+
+julia> widedf = unstack(longdf, :variable, :value);
+
+julia> head(widedf)
+6×6 DataFrames.DataFrame
+│ Row │ Species │ id │ PetalLength │ PetalWidth │ SepalLength │ SepalWidth │
+├─────┼─────────┼────┼─────────────┼────────────┼─────────────┼────────────┤
+│ 1 │ setosa │ 1 │ 1.4 │ 0.2 │ 5.1 │ 3.5 │
+│ 2 │ setosa │ 2 │ 1.4 │ 0.2 │ 4.9 │ 3.0 │
+│ 3 │ setosa │ 3 │ 1.3 │ 0.2 │ 4.7 │ 3.2 │
+│ 4 │ setosa │ 4 │ 1.5 │ 0.2 │ 4.6 │ 3.1 │
+│ 5 │ setosa │ 5 │ 1.4 │ 0.2 │ 5.0 │ 3.6 │
+│ 6 │ setosa │ 6 │ 1.7 │ 0.4 │ 5.4 │ 3.9 │
+
```
`stackdf` and `meltdf` are two additional functions that work like `stack` and `melt`, but they provide a view into the original wide DataFrame. Here is an example:
-```julia
-d = stackdf(iris)
+```jldoctest reshape
+julia> d = stackdf(iris);
+
+julia> head(d)
+6×4 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │ id │
+├─────┼─────────────┼───────┼─────────┼────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │ 1 │
+│ 2 │ SepalLength │ 4.9 │ setosa │ 2 │
+│ 3 │ SepalLength │ 4.7 │ setosa │ 3 │
+│ 4 │ SepalLength │ 4.6 │ setosa │ 4 │
+│ 5 │ SepalLength │ 5.0 │ setosa │ 5 │
+│ 6 │ SepalLength │ 5.4 │ setosa │ 6 │
+
+julia> tail(d)
+6×4 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │ id │
+├─────┼────────────┼───────┼───────────┼─────┤
+│ 1 │ PetalWidth │ 2.5 │ virginica │ 145 │
+│ 2 │ PetalWidth │ 2.3 │ virginica │ 146 │
+│ 3 │ PetalWidth │ 1.9 │ virginica │ 147 │
+│ 4 │ PetalWidth │ 2.0 │ virginica │ 148 │
+│ 5 │ PetalWidth │ 2.3 │ virginica │ 149 │
+│ 6 │ PetalWidth │ 1.8 │ virginica │ 150 │
+
```
This saves memory. To create the view, several AbstractVectors are defined:
@@ -70,16 +274,43 @@ This is provides a view of the original columns stacked together.
Id columns -- `RepeatedVector`
This repeats the original columns N times where N is the number of columns stacked.
-For more details on the storage representation, see:
+None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example:
-```julia
-dump(stackdf(iris))
-```
+```jldoctest reshape
+julia> d = melt(iris, :Species);
-None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example:
+julia> head(d)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ value │ Species │
+├─────┼─────────────┼───────┼─────────┤
+│ 1 │ SepalLength │ 5.1 │ setosa │
+│ 2 │ SepalLength │ 4.9 │ setosa │
+│ 3 │ SepalLength │ 4.7 │ setosa │
+│ 4 │ SepalLength │ 4.6 │ setosa │
+│ 5 │ SepalLength │ 5.0 │ setosa │
+│ 6 │ SepalLength │ 5.4 │ setosa │
+
+julia> x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(df[:value])));
+
+julia> head(x)
+6×3 DataFrames.DataFrame
+│ Row │ variable │ Species │ vsum │
+├─────┼─────────────┼────────────┼───────┤
+│ 1 │ SepalLength │ setosa │ 5.006 │
+│ 2 │ SepalLength │ versicolor │ 5.936 │
+│ 3 │ SepalLength │ virginica │ 6.588 │
+│ 4 │ SepalWidth │ setosa │ 3.428 │
+│ 5 │ SepalWidth │ versicolor │ 2.77 │
+│ 6 │ SepalWidth │ virginica │ 2.974 │
+
+julia> head(unstack(x, :Species, :vsum))
+5×4 DataFrames.DataFrame
+│ Row │ variable │ setosa │ versicolor │ virginica │
+├─────┼─────────────┼────────┼────────────┼───────────┤
+│ 1 │ PetalLength │ 1.462 │ 4.26 │ 5.552 │
+│ 2 │ PetalWidth │ 0.246 │ 1.326 │ 2.026 │
+│ 3 │ SepalLength │ 5.006 │ 5.936 │ 6.588 │
+│ 4 │ SepalWidth │ 3.428 │ 2.77 │ 2.974 │
+│ 5 │ id │ 25.5 │ 75.5 │ 125.5 │
-```julia
-d = stack(iris)
-x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(Nulls.skip(df[:value]))))
-unstack(x, :Species, :vsum)
```
diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md
index 68f3db2683..e625e14830 100644
--- a/docs/src/man/sorting.md
+++ b/docs/src/man/sorting.md
@@ -2,21 +2,113 @@
Sorting is a fundamental component of data analysis. Basic sorting is trivial: just calling `sort!` will sort all columns, in place:
-```julia
-using DataFrames
-using CSV
-iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), DataFrame)
-sort!(iris)
+```jldoctest sort
+julia> using DataFrames, CSV
+
+julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"));
+
+julia> sort!(iris);
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 4.3 │ 3.0 │ 1.1 │ 0.1 │ setosa │
+│ 2 │ 4.4 │ 2.9 │ 1.4 │ 0.2 │ setosa │
+│ 3 │ 4.4 │ 3.0 │ 1.3 │ 0.2 │ setosa │
+│ 4 │ 4.4 │ 3.2 │ 1.3 │ 0.2 │ setosa │
+│ 5 │ 4.5 │ 2.3 │ 1.3 │ 0.3 │ setosa │
+│ 6 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 7.6 │ 3.0 │ 6.6 │ 2.1 │ virginica │
+│ 2 │ 7.7 │ 2.6 │ 6.9 │ 2.3 │ virginica │
+│ 3 │ 7.7 │ 2.8 │ 6.7 │ 2.0 │ virginica │
+│ 4 │ 7.7 │ 3.0 │ 6.1 │ 2.3 │ virginica │
+│ 5 │ 7.7 │ 3.8 │ 6.7 │ 2.2 │ virginica │
+│ 6 │ 7.9 │ 3.8 │ 6.4 │ 2.0 │ virginica │
+
```
When sorting DataFrames, you may want to sort different columns with different options. Here are some examples showing most of the possible options:
-```julia
-sort!(iris, rev = true)
-sort!(iris, cols = [:SepalWidth, :SepalLength])
+```jldoctest sort
+julia> sort!(iris, rev = true);
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 7.9 │ 3.8 │ 6.4 │ 2.0 │ virginica │
+│ 2 │ 7.7 │ 3.8 │ 6.7 │ 2.2 │ virginica │
+│ 3 │ 7.7 │ 3.0 │ 6.1 │ 2.3 │ virginica │
+│ 4 │ 7.7 │ 2.8 │ 6.7 │ 2.0 │ virginica │
+│ 5 │ 7.7 │ 2.6 │ 6.9 │ 2.3 │ virginica │
+│ 6 │ 7.6 │ 3.0 │ 6.6 │ 2.1 │ virginica │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │
+│ 2 │ 4.5 │ 2.3 │ 1.3 │ 0.3 │ setosa │
+│ 3 │ 4.4 │ 3.2 │ 1.3 │ 0.2 │ setosa │
+│ 4 │ 4.4 │ 3.0 │ 1.3 │ 0.2 │ setosa │
+│ 5 │ 4.4 │ 2.9 │ 1.4 │ 0.2 │ setosa │
+│ 6 │ 4.3 │ 3.0 │ 1.1 │ 0.1 │ setosa │
+
+julia> sort!(iris, cols = [:SepalWidth, :SepalLength]);
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼────────────┤
+│ 1 │ 5.0 │ 2.0 │ 3.5 │ 1.0 │ versicolor │
+│ 2 │ 6.0 │ 2.2 │ 5.0 │ 1.5 │ virginica │
+│ 3 │ 6.0 │ 2.2 │ 4.0 │ 1.0 │ versicolor │
+│ 4 │ 6.2 │ 2.2 │ 4.5 │ 1.5 │ versicolor │
+│ 5 │ 4.5 │ 2.3 │ 1.3 │ 0.3 │ setosa │
+│ 6 │ 5.0 │ 2.3 │ 3.3 │ 1.0 │ versicolor │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │
+│ 2 │ 5.4 │ 3.9 │ 1.3 │ 0.4 │ setosa │
+│ 3 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │
+│ 4 │ 5.2 │ 4.1 │ 1.5 │ 0.1 │ setosa │
+│ 5 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │
+│ 6 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │
+
+julia> sort!(iris, cols = [order(:Species, by = uppercase),
+ order(:SepalLength, rev = true)]);
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │
+│ 2 │ 5.7 │ 3.8 │ 1.7 │ 0.3 │ setosa │
+│ 3 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │
+│ 4 │ 5.5 │ 3.5 │ 1.3 │ 0.2 │ setosa │
+│ 5 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │
+│ 6 │ 5.4 │ 3.4 │ 1.7 │ 0.2 │ setosa │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │
+│ 2 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │
+│ 3 │ 5.8 │ 2.8 │ 5.1 │ 2.4 │ virginica │
+│ 4 │ 5.7 │ 2.5 │ 5.0 │ 2.0 │ virginica │
+│ 5 │ 5.6 │ 2.8 │ 4.9 │ 2.0 │ virginica │
+│ 6 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ virginica │
-sort!(iris, cols = [order(:Species, by = uppercase),
- order(:SepalLength, rev = true)])
```
Keywords used above include `cols` (to specify columns), `rev` (to sort a column or the whole DataFrame in reverse), and `by` (to apply a function to a column/DataFrame). Each keyword can either be a single value, or can be a tuple or array, with values corresponding to individual columns.
@@ -25,9 +117,54 @@ As an alternative to using array or tuple values, `order` to specify an ordering
The following two examples show two ways to sort the `iris` dataset with the same result: `Species` will be ordered in reverse lexicographic order, and within species, rows will be sorted by increasing sepal length and width:
-```julia
-sort!(iris, cols = (:Species, :SepalLength, :SepalWidth),
- rev = (true, false, false))
+```jldoctest sort
+julia> sort!(iris, cols = (:Species, :SepalLength, :SepalWidth),
+ rev = (true, false, false));
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ virginica │
+│ 2 │ 5.6 │ 2.8 │ 4.9 │ 2.0 │ virginica │
+│ 3 │ 5.7 │ 2.5 │ 5.0 │ 2.0 │ virginica │
+│ 4 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │
+│ 5 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │
+│ 6 │ 5.8 │ 2.8 │ 5.1 │ 2.4 │ virginica │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.4 │ 3.9 │ 1.3 │ 0.4 │ setosa │
+│ 2 │ 5.5 │ 3.5 │ 1.3 │ 0.2 │ setosa │
+│ 3 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │
+│ 4 │ 5.7 │ 3.8 │ 1.7 │ 0.3 │ setosa │
+│ 5 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │
+│ 6 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │
+
+julia> sort!(iris, cols = (order(:Species, rev = true), :SepalLength, :SepalWidth));
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ virginica │
+│ 2 │ 5.6 │ 2.8 │ 4.9 │ 2.0 │ virginica │
+│ 3 │ 5.7 │ 2.5 │ 5.0 │ 2.0 │ virginica │
+│ 4 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │
+│ 5 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │
+│ 6 │ 5.8 │ 2.8 │ 5.1 │ 2.4 │ virginica │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.4 │ 3.9 │ 1.3 │ 0.4 │ setosa │
+│ 2 │ 5.5 │ 3.5 │ 1.3 │ 0.2 │ setosa │
+│ 3 │ 5.5 │ 4.2 │ 1.4 │ 0.2 │ setosa │
+│ 4 │ 5.7 │ 3.8 │ 1.7 │ 0.3 │ setosa │
+│ 5 │ 5.7 │ 4.4 │ 1.5 │ 0.4 │ setosa │
+│ 6 │ 5.8 │ 4.0 │ 1.2 │ 0.2 │ setosa │
-sort!(iris, cols = (order(:Species, rev = true), :SepalLength, :SepalWidth))
```
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index 78fb637c39..2f87690b99 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -6,37 +6,104 @@ The DataFrames package supports the Split-Apply-Combine strategy through the `by
We show several examples of the `by` function applied to the `iris` dataset below:
-```julia
-using DataFrames
-using CSV
-iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), DataFrame)
-
-by(iris, :Species, size)
-by(iris, :Species, df -> mean(Nulls.skip(df[:PetalLength])))
-by(iris, :Species, df -> DataFrame(N = size(df, 1)))
+```jldoctest sac
+julia> using DataFrames, CSV
+
+julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"));
+
+julia> head(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────┤
+│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │
+│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │
+│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │
+│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │
+│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │
+│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │
+
+julia> tail(iris)
+6×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼───────────┤
+│ 1 │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ virginica │
+│ 2 │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ virginica │
+│ 3 │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ virginica │
+│ 4 │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ virginica │
+│ 5 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ virginica │
+│ 6 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ virginica │
+
+
+julia> by(iris, :Species, size)
+3×2 DataFrames.DataFrame
+│ Row │ Species │ x1 │
+├─────┼────────────┼─────────┤
+│ 1 │ setosa │ (50, 5) │
+│ 2 │ versicolor │ (50, 5) │
+│ 3 │ virginica │ (50, 5) │
+
+julia> by(iris, :Species, df -> mean(df[:PetalLength]))
+3×2 DataFrames.DataFrame
+│ Row │ Species │ x1 │
+├─────┼────────────┼───────┤
+│ 1 │ setosa │ 1.462 │
+│ 2 │ versicolor │ 4.26 │
+│ 3 │ virginica │ 5.552 │
+
+julia> by(iris, :Species, df -> DataFrame(N = size(df, 1)))
+3×2 DataFrames.DataFrame
+│ Row │ Species │ N │
+├─────┼────────────┼────┤
+│ 1 │ setosa │ 50 │
+│ 2 │ versicolor │ 50 │
+│ 3 │ virginica │ 50 │
+
```
The `by` function also supports the `do` block form:
-```julia
-by(iris, :Species) do df
- DataFrame(m = mean(Nulls.skip(df[:PetalLength])), s² = var(Nulls.skip(df[:PetalLength])))
-end
+```jldoctest sac
+julia> by(iris, :Species) do df
+ DataFrame(m = mean(df[:PetalLength]), s² = var(df[:PetalLength]))
+ end
+3×3 DataFrames.DataFrame
+│ Row │ Species │ m │ s² │
+├─────┼────────────┼───────┼───────────┤
+│ 1 │ setosa │ 1.462 │ 0.0301592 │
+│ 2 │ versicolor │ 4.26 │ 0.220816 │
+│ 3 │ virginica │ 5.552 │ 0.304588 │
+
```
-A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) one or more functions that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column, that was not used to split the DataFrame, creating new columns of the form `$name_$function` e.g. `SepalLength_mean`. Anonymous functions and expressions that do not have a name will be called `λ1`.
+A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) one or more functions that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column that was not used to split the DataFrame, creating new columns of the form `$name_$function`. For named functions like `mean` this will produce columns with names like `SepalLength_mean`. For anonymous functions like `x -> sqrt(x)^e`, which Julia tracks and references by a numerical identifier, e.g. `#12`, the produced columns will have names like `SepalLength_#12`. We show several examples of the `aggregate` function applied to the `iris` dataset below:
-We show several examples of the `aggregate` function applied to the `iris` dataset below:
+```jldoctest sac
+julia> aggregate(iris, :Species, length)
+3×5 DataFrames.DataFrame
+│ Row │ Species │ SepalLength_length │ SepalWidth_length │ PetalLength_length │ PetalWidth_length │
+├─────┼────────────┼────────────────────┼───────────────────┼────────────────────┼───────────────────┤
+│ 1 │ setosa │ 50 │ 50 │ 50 │ 50 │
+│ 2 │ versicolor │ 50 │ 50 │ 50 │ 50 │
+│ 3 │ virginica │ 50 │ 50 │ 50 │ 50 │
+
+julia> aggregate(iris, :Species, [sum, mean])
+3×9 DataFrames.DataFrame
+│ Row │ Species │ SepalLength_sum │ SepalWidth_sum │ PetalLength_sum │ PetalWidth_sum │ SepalLength_mean │ SepalWidth_mean │ PetalLength_mean │ PetalWidth_mean │
+├─────┼────────────┼─────────────────┼────────────────┼─────────────────┼────────────────┼──────────────────┼─────────────────┼──────────────────┼─────────────────┤
+│ 1 │ setosa │ 250.3 │ 171.4 │ 73.1 │ 12.3 │ 5.006 │ 3.428 │ 1.462 │ 0.246 │
+│ 2 │ versicolor │ 296.8 │ 138.5 │ 213.0 │ 66.3 │ 5.936 │ 2.77 │ 4.26 │ 1.326 │
+│ 3 │ virginica │ 329.4 │ 148.7 │ 277.6 │ 101.3 │ 6.588 │ 2.974 │ 5.552 │ 2.026 │
-```julia
-aggregate(iris, :Species, sum)
-aggregate(iris, :Species, [sum, x->mean(Nulls.skip(x))])
```
If you only want to split the data set into subsets, use the `groupby` function:
-```julia
-for subdf in groupby(iris, :Species)
- println(size(subdf, 1))
-end
+```jldoctest sac
+julia> for subdf in groupby(iris, :Species)
+ println(size(subdf, 1))
+ end
+50
+50
+50
+
```
diff --git a/docs/src/man/subsets.md b/docs/src/man/subsets.md
index 29f4295dd6..b20d4ce189 100644
--- a/docs/src/man/subsets.md
+++ b/docs/src/man/subsets.md
@@ -2,7 +2,7 @@
A `DataFrame` supports many forms of indexing.
-```julia
+```jldoctest subsets
julia> using DataFrames
julia> df = DataFrame(A = 1:10, B = 2:2:20)
@@ -19,11 +19,12 @@ julia> df = DataFrame(A = 1:10, B = 2:2:20)
│ 8 │ 8 │ 16 │
│ 9 │ 9 │ 18 │
│ 10 │ 10 │ 20 │
+
```
Referring to the first column by index or name:
-```julia
+```jldoctest subsets
julia> df[1]
10-element Array{Int64,1}:
1
@@ -49,21 +50,23 @@ julia> df[:A]
8
9
10
+
```
Referring to the first element of the first column:
-```julia
+```jldoctest subsets
julia> df[1, 1]
1
julia> df[1, :A]
1
+
```
Selecting a subset of rows by index and an (ordered) subset of columns by name:
-```julia
+```jldoctest subsets
julia> df[1:3, [:A, :B]]
3×2 DataFrames.DataFrame
│ Row │ A │ B │
@@ -79,4 +82,5 @@ julia> df[1:3, [:B, :A]]
│ 1 │ 2 │ 1 │
│ 2 │ 4 │ 2 │
│ 3 │ 6 │ 3 │
+
```
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 2988c2610a..16c23d2239 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -49,7 +49,6 @@ export AbstractDataFrame,
nrow,
nullable!,
order,
- printtable,
rename!,
rename,
showcols,
diff --git a/src/other/utils.jl b/src/other/utils.jl
index 8aba353e1a..bf2940d9eb 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -82,18 +82,12 @@ function make_unique(names::Vector{Symbol}; allow_duplicates=true)
return names
end
-#' @description
-#'
-#' Generate standardized names for columns of a DataFrame. The
-#' first name will be :x1, the second :x2, etc.
-#'
-#' @field n::Integer The number of names to generate.
-#'
-#' @returns names::Vector{Symbol} A vector of standardized column names.
-#'
-#' @examples
-#'
-#' DataFrames.gennames(10)
+"""
+ gennames(n::Integer)
+
+Generate standardized names for columns of a DataFrame. The first name will be `:x1`, the
+second `:x2`, etc.
+"""
function gennames(n::Integer)
res = Array{Symbol}(n)
for i in 1:n
@@ -103,17 +97,11 @@ function gennames(n::Integer)
end
-#' @description
-#'
-#' Count the number of null values in an array.
-#'
-#' @field a::AbstractArray The array whose missing values are to be counted.
-#'
-#' @returns count::Int The number of null values in `a`.
-#'
-#' @examples
-#'
-#' DataFrames.countnull([1, 2, 3])
+"""
+ countnull(a::AbstractArray)
+
+Count the number of `null` values in an array.
+"""
function countnull(a::AbstractArray)
res = 0
for x in a
@@ -122,18 +110,6 @@ function countnull(a::AbstractArray)
return res
end
-#' @description
-#'
-#' Count the number of missing values in a CategoricalArray.
-#'
-#' @field na::CategoricalArray The CategoricalArray whose missing values
-#' are to be counted.
-#'
-#' @returns count::Int The number of null values in `a`.
-#'
-#' @examples
-#'
-#' DataFrames.countnull(CategoricalArray([1, 2, 3]))
function countnull(a::CategoricalArray)
res = 0
for x in a.refs
@@ -155,4 +131,3 @@ function _fnames(fs::Vector{T}) where T<:Function
end
names
end
-
diff --git a/test/io.jl b/test/io.jl
index cd527946db..001026e42a 100644
--- a/test/io.jl
+++ b/test/io.jl
@@ -50,7 +50,7 @@ module TestIO
G = nulls(3),
H = fill(null, 3))
- @test sprint(printtable, df) ==
+ @test sprint(DataFrames.printtable, df) ==
"""
"A","B","C","D","E","F","G","H"
1,"'a'","A","a","A","1",null,null