Skip to content

Commit

Permalink
Add DF describe (#444)
Browse files Browse the repository at this point in the history
  • Loading branch information
Cristine Guadelupe authored Dec 2, 2022
1 parent 9eb2428 commit 710827b
Show file tree
Hide file tree
Showing 8 changed files with 54 additions and 0 deletions.
1 change: 1 addition & 0 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ defmodule Explorer.Backend.DataFrame do
values_to :: column_name()
) :: df
@callback put(df, out_df :: df(), column_name(), series()) :: df
# Backend hook behind `Explorer.DataFrame.describe/1`. `out_df` is the result
# frame prepared by the caller (names and dtypes already filled in).
@callback describe(df, out_df :: df()) :: df()

# Two or more table verbs

Expand Down
25 changes: 25 additions & 0 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4389,6 +4389,31 @@ defmodule Explorer.DataFrame do
|> IO.puts()
end

@doc """
Computes summary statistics for the columns of a dataframe.

The result starts with a `"describe"` string column that labels each
statistic, followed by one float column per column of the input.
In the example below, the string columns only report a value for `"count"`.

Groups are ignored if the dataframe is using any.

## Examples

    iex> df = DF.new(a: ["d", nil, "f"], b: [1, 2, 3], c: ["a", "b", "c"])
    iex> Explorer.DataFrame.describe(df)
    #Explorer.DataFrame<
      Polars[8 x 4]
      describe string ["count", "mean", "std", "min", "25%", ...]
      a float [3.0, nil, nil, nil, nil, ...]
      b float [3.0, 2.0, 1.0, 1.0, 1.5, ...]
      c float [3.0, nil, nil, nil, nil, ...]
    >

"""
@doc type: :single
@spec describe(df :: DataFrame.t()) :: DataFrame.t()
def describe(df) do
  # Seed the dtype map with the label column, then mark every input column as float.
  # Input columns are inserted after the seed, so they win on a key clash.
  dtypes = Enum.into(df.names, %{"describe" => :string}, fn column -> {column, :float} end)

  # Expected shape of the result: label column first, groups cleared.
  out_df = %{
    df
    | names: ["describe" | df.names],
      dtypes: dtypes,
      groups: []
  }

  Shared.apply_impl(df, :describe, [out_df])
end

# Helpers

defp backend_from_options!(opts) do
Expand Down
5 changes: 5 additions & 0 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,11 @@ defmodule Explorer.PolarsBackend.DataFrame do
Shared.apply_dataframe(df, out_df, :df_put_column, [series.data])
end

# Forwards to `Shared.apply_dataframe/4` with the `:df_describe` operation and
# no extra arguments; `out_df` is passed through unchanged.
@impl true
def describe(%DataFrame{} = df, %DataFrame{} = out_df),
  do: Shared.apply_dataframe(df, out_df, :df_describe, [])

@impl true
def arrange_with(%DataFrame{} = df, out_df, column_pairs) do
{directions, expressions} =
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ defmodule Explorer.PolarsBackend.Native do
def df_to_ndjson(_df, _filename), do: err()
def df_to_parquet(_df, _filename, _compression, _compression_level), do: err()
def df_width(_df), do: err()
# Elixir-side stub for the `df_describe` native function.
# NOTE(review): presumably only reached when the native library is not loaded;
# confirm what `err/0` does.
def df_describe(_df), do: err()

# Expressions (for lazy queries)
# We first generate functions for known operations.
Expand Down
1 change: 1 addition & 0 deletions native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ features = [
"cross_join",
"cum_agg",
"decompress",
"describe",
"dtype-date",
"dtype-datetime",
"dtype-binary",
Expand Down
8 changes: 8 additions & 0 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,14 @@ pub fn df_put_column(data: ExDataFrame, series: ExSeries) -> Result<ExDataFrame,
Ok(ExDataFrame::new(new_df.clone()))
}

/// Builds the summary-statistics dataframe for `data`.
///
/// Passes `None` for the percentiles so Polars uses its defaults.
/// Scheduled on a dirty CPU scheduler, like the neighbouring NIFs.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_describe(data: ExDataFrame) -> Result<ExDataFrame, ExplorerError> {
    // `describe` only needs a shared reference and returns a fresh frame,
    // so borrow the resource instead of cloning the whole DataFrame first.
    let df: &DataFrame = &data.resource.0;
    let described = df.describe(None);

    Ok(ExDataFrame::new(described))
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_mutate_with_exprs(
data: ExDataFrame,
Expand Down
1 change: 1 addition & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ rustler::init!(
df_arrange_with,
df_concat_columns,
df_concat_rows,
df_describe,
df_distinct,
df_drop,
df_drop_nulls,
Expand Down
12 changes: 12 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2034,4 +2034,16 @@ defmodule Explorer.DataFrameTest do
fn -> DF.put(df, :e, Nx.tensor([1, 2, 3], type: {:u, 32})) end
end
end

test "describe/1" do
  # Mixed input: two string columns (one with a nil) and one integer column.
  described =
    [a: ["d", nil, "f"], b: [1, 2, 3], c: ["a", "b", "c"]]
    |> DF.new()
    |> DF.describe()

  # String columns only report a count; the integer column gets every statistic.
  expected = %{
    describe: ["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    a: [3.0, nil, nil, nil, nil, nil, nil, nil],
    b: [3.0, 2.0, 1.0, 1.0, 1.5, 2.0, 2.5, 3.0],
    c: [3.0, nil, nil, nil, nil, nil, nil, nil]
  }

  assert DF.to_columns(described, atom_keys: true) == expected
end
end

0 comments on commit 710827b

Please sign in to comment.