Add DF describe (#444)

elixir-explorer · Dec 2, 2022 · 710827b · 710827b
1 parent 9eb2428
commit 710827b
Show file tree

Hide file tree

Showing 8 changed files with 54 additions and 0 deletions.
diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex
@@ -163,6 +163,7 @@ defmodule Explorer.Backend.DataFrame do
               values_to :: column_name()
             ) :: df
   @callback put(df, out_df :: df(), column_name(), series()) :: df
+  # Builds the summary-statistics dataframe for `df`. `out_df` is the frontend's
+  # precomputed description (names, dtypes, groups) of the expected result.
+  @callback describe(df, out_df :: df()) :: df()
 
   # Two or more table verbs
 

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
@@ -4389,6 +4389,31 @@ defmodule Explorer.DataFrame do
     |> IO.puts()
   end
 
+  @doc """
+  Computes summary statistics for the columns of a dataframe.
+
+  The result has one row per statistic (`count`, `mean`, `std`, `min`, `25%`,
+  `50%`, `75%` and `max`), labelled by a leading `"describe"` string column,
+  followed by one float column for each column of `df`.
+
+  Only numeric columns get the full set of statistics. As the example below
+  shows, non-numeric columns report their `count` and `nil` for everything else.
+
+  Groups are ignored if the dataframe is using any.
+
+  ## Examples
+
+      iex> df = Explorer.DataFrame.new(a: ["d", nil, "f"], b: [1, 2, 3], c: ["a", "b", "c"])
+      iex> Explorer.DataFrame.describe(df)
+      #Explorer.DataFrame<
+        Polars[8 x 4]
+        describe string ["count", "mean", "std", "min", "25%", ...]
+        a float [3.0, nil, nil, nil, nil, ...]
+        b float [3.0, 2.0, 1.0, 1.0, 1.5, ...]
+        c float [3.0, nil, nil, nil, nil, ...]
+      >
+  """
+  @doc type: :single
+  @spec describe(df :: DataFrame.t()) :: DataFrame.t()
+  def describe(df) do
+    # Every input column is reported as float; the label column is a string.
+    # NOTE(review): an input column already named "describe" would collide with
+    # the label column (duplicate name, dtype overwritten) - confirm how the
+    # backend handles that case.
+    dtypes = for name <- df.names, into: %{"describe" => :string}, do: {name, :float}
+
+    # The result is never grouped, whatever groups `df` carries.
+    out_df = %{df | names: ["describe" | df.names], dtypes: dtypes, groups: []}
+
+    Shared.apply_impl(df, :describe, [out_df])
+  end
+
   # Helpers
 
   defp backend_from_options!(opts) do

diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex
@@ -395,6 +395,11 @@ defmodule Explorer.PolarsBackend.DataFrame do
     Shared.apply_dataframe(df, out_df, :df_put_column, [series.data])
   end
 
+  # Runs the `:df_describe` native operation on `df`; `out_df` is handed to
+  # `Shared.apply_dataframe/4` as the expected shape of the result.
+  @impl true
+  def describe(%DataFrame{} = df, %DataFrame{} = out_df),
+    do: Shared.apply_dataframe(df, out_df, :df_describe, [])
+
   @impl true
   def arrange_with(%DataFrame{} = df, out_df, column_pairs) do
     {directions, expressions} =

diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
@@ -104,6 +104,7 @@ defmodule Explorer.PolarsBackend.Native do
   def df_to_ndjson(_df, _filename), do: err()
   def df_to_parquet(_df, _filename, _compression, _compression_level), do: err()
   def df_width(_df), do: err()
+  # Elixir-side stub for the `df_describe` NIF.
+  # NOTE(review): `err()` presumably raises when the native library is not loaded - confirm.
+  def df_describe(_df), do: err()
 
   # Expressions (for lazy queries)
   # We first generate functions for known operations.

diff --git a/native/explorer/Cargo.toml b/native/explorer/Cargo.toml
@@ -32,6 +32,7 @@ features = [
   "cross_join",
   "cum_agg",
   "decompress",
+  "describe",
   "dtype-date",
   "dtype-datetime",
   "dtype-binary",

diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs
@@ -417,6 +417,14 @@ pub fn df_put_column(data: ExDataFrame, series: ExSeries) -> Result<ExDataFrame,
     Ok(ExDataFrame::new(new_df.clone()))
 }
 
+/// Returns a new dataframe holding Polars' summary statistics for `data`.
+#[rustler::nif(schedule = "DirtyCpu")]
+pub fn df_describe(data: ExDataFrame) -> Result<ExDataFrame, ExplorerError> {
+    let df: &DataFrame = &data.resource.0;
+
+    // `None` leaves the percentile selection to Polars.
+    let summary = df.describe(None);
+
+    Ok(ExDataFrame::new(summary))
+}
+
 #[rustler::nif(schedule = "DirtyCpu")]
 pub fn df_mutate_with_exprs(
     data: ExDataFrame,

diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs
@@ -67,6 +67,7 @@ rustler::init!(
         df_arrange_with,
         df_concat_columns,
         df_concat_rows,
+        df_describe,
         df_distinct,
         df_drop,
         df_drop_nulls,

diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs
@@ -2034,4 +2034,16 @@ defmodule Explorer.DataFrameTest do
                    fn -> DF.put(df, :e, Nx.tensor([1, 2, 3], type: {:u, 32})) end
     end
   end
+
+  test "describe/1" do
+    # One column containing a nil, one numeric column, one plain string column.
+    summary =
+      [a: ["d", nil, "f"], b: [1, 2, 3], c: ["a", "b", "c"]]
+      |> DF.new()
+      |> DF.describe()
+
+    expected = %{
+      a: [3.0, nil, nil, nil, nil, nil, nil, nil],
+      b: [3.0, 2.0, 1.0, 1.0, 1.5, 2.0, 2.5, 3.0],
+      c: [3.0, nil, nil, nil, nil, nil, nil, nil],
+      describe: ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
+    }
+
+    assert DF.to_columns(summary, atom_keys: true) == expected
+  end
 end