Skip to content

Commit

Permalink
Add some LazyFrame functions - part III (#488)
Browse files Browse the repository at this point in the history
* Add LazyFrame impl for DF.rename

It rename columns for our lazy frame.

* Add "DF.drop_nil/2" to LazyFrame

* Add `DF.pivot_longer/3` to LazyFrame impl

This is basically equal to the eager frame.
  • Loading branch information
Philip Sampaio authored Jan 29, 2023
1 parent 74821c6 commit f9646a7
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 4 deletions.
2 changes: 2 additions & 0 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2779,6 +2779,8 @@ defmodule Explorer.DataFrame do

def drop_nil(df, column) when is_column(column), do: drop_nil(df, [column])

def drop_nil(df, []), do: df

def drop_nil(df, columns) do
columns = to_existing_columns(df, columns)

Expand Down
2 changes: 1 addition & 1 deletion lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
do: Shared.apply_dataframe(df, df, :df_slice, [offset, length, df.groups])

@impl true
def drop_nil(df, columns), do: Shared.apply_dataframe(df, df, :df_drop_nulls, [columns])
def drop_nil(df, columns), do: Shared.apply_dataframe(df, df, :df_drop_nils, [columns])

@impl true
def pivot_longer(df, out_df, columns_to_pivot, columns_to_keep, names_to, values_to) do
Expand Down
20 changes: 20 additions & 0 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,26 @@ defmodule Explorer.PolarsBackend.LazyFrame do
raise "mutate_with/2 with groups is not supported yet for lazy frames"
end

@impl true
def rename(%DF{} = df, %DF{} = out_df, pairs),
do: Shared.apply_dataframe(df, out_df, :lf_rename_columns, [pairs])

@impl true
def drop_nil(%DF{} = df, columns) do
exprs = for col <- columns, do: Native.expr_column(col)
Shared.apply_dataframe(df, df, :lf_drop_nils, [exprs])
end

@impl true
def pivot_longer(%DF{} = df, %DF{} = out_df, cols_to_pivot, cols_to_keep, names_to, values_to),
do:
Shared.apply_dataframe(df, out_df, :lf_pivot_longer, [
cols_to_keep,
cols_to_pivot,
names_to,
values_to
])

# Groups

@impl true
Expand Down
5 changes: 4 additions & 1 deletion lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ defmodule Explorer.PolarsBackend.Native do
def df_concat_rows(_df, _others), do: err()
def df_distinct(_df, _subset, _selection), do: err()
def df_drop(_df, _name), do: err()
def df_drop_nulls(_df, _subset), do: err()
def df_drop_nils(_df, _subset), do: err()
def df_dtypes(_df), do: err()
def df_dump_csv(_df, _has_headers, _delimiter), do: err()
def df_dump_ndjson(_df), do: err()
Expand Down Expand Up @@ -177,6 +177,9 @@ defmodule Explorer.PolarsBackend.Native do
def lf_distinct(_df, _subset, _selection), do: err()
def lf_mutate_with(_df, _exprs), do: err()
def lf_summarise_with(_df, _groups, _aggs), do: err()
def lf_rename_columns(_df, _column_pairs), do: err()
def lf_drop_nils(_df, _column_pairs), do: err()
def lf_pivot_longer(_df, _id_vars, _value_vars, _names_to, _values_to), do: err()

# Series
def s_add(_s, _other), do: err()
Expand Down
2 changes: 1 addition & 1 deletion native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ pub fn df_concat_columns(
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_drop_nulls(
pub fn df_drop_nils(
data: ExDataFrame,
subset: Option<Vec<String>>,
) -> Result<ExDataFrame, ExplorerError> {
Expand Down
41 changes: 41 additions & 0 deletions native/explorer/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,44 @@ pub fn lf_summarise_with(
let new_df = ldf.groupby_stable(groups).agg(aggs);
Ok(ExLazyFrame::new(new_df))
}

#[rustler::nif]
pub fn lf_rename_columns(
data: ExLazyFrame,
renames: Vec<(&str, &str)>,
) -> Result<ExLazyFrame, ExplorerError> {
let df = data.clone_inner();
let (existing, new): (Vec<_>, Vec<_>) = renames.iter().cloned().unzip();

Ok(ExLazyFrame::new(df.rename(existing, new)))
}

#[rustler::nif]
pub fn lf_drop_nils(
data: ExLazyFrame,
subset: Option<Vec<ExExpr>>,
) -> Result<ExLazyFrame, ExplorerError> {
let ldf = data.clone_inner();
let columns = subset.map(ex_expr_to_exprs);

Ok(ExLazyFrame::new(ldf.drop_nulls(columns)))
}

#[rustler::nif]
pub fn lf_pivot_longer(
data: ExLazyFrame,
id_vars: Vec<String>,
value_vars: Vec<String>,
names_to: String,
values_to: String,
) -> Result<ExLazyFrame, ExplorerError> {
let ldf = data.clone_inner();
let melt_opts = MeltArgs {
id_vars,
value_vars,
variable_name: Some(names_to),
value_name: Some(values_to),
};
let new_df = ldf.melt(melt_opts);
Ok(ExLazyFrame::new(new_df))
}
5 changes: 4 additions & 1 deletion native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ rustler::init!(
df_describe,
df_distinct,
df_drop,
df_drop_nulls,
df_drop_nils,
df_dtypes,
df_dump_csv,
df_dump_ndjson,
Expand Down Expand Up @@ -230,6 +230,9 @@ rustler::init!(
lf_distinct,
lf_mutate_with,
lf_summarise_with,
lf_rename_columns,
lf_drop_nils,
lf_pivot_longer,
// series
s_add,
s_and,
Expand Down
144 changes: 144 additions & 0 deletions test/explorer/data_frame/lazy_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -582,4 +582,148 @@ defmodule Explorer.DataFrame.LazyTest do
}
end
end

describe "rename/2" do
test "renames a column" do
ldf = DF.new([a: [1, 2, 3], b: ["a", "b", "c"]], lazy: true)

ldf1 = DF.rename(ldf, %{"a" => "ids"})

assert ldf1.names == ["ids", "b"]
assert ldf1.dtypes == %{"ids" => :integer, "b" => :string}

df = DF.collect(ldf1)

assert DF.to_columns(df, atom_keys: true) == %{
ids: [1, 2, 3],
b: ["a", "b", "c"]
}

assert ldf1.names == df.names
assert ldf1.dtypes == df.dtypes
end
end

describe "drop_nils/2" do
test "considering all columns" do
ldf = DF.new([a: [1, 2, nil], b: [1, nil, 3]], lazy: true)

ldf1 = DF.drop_nil(ldf)

df = DF.collect(ldf1)
assert DF.to_columns(df) == %{"a" => [1], "b" => [1]}
end

test "considering one column" do
ldf = DF.new([a: [1, 2, nil], b: [1, nil, 3]], lazy: true)

ldf1 = DF.drop_nil(ldf, :a)

df = DF.collect(ldf1)
assert DF.to_columns(df) == %{"a" => [1, 2], "b" => [1, nil]}
end

test "selecting none" do
ldf = DF.new([a: [1, 2, nil], b: [1, nil, 3]], lazy: true)

ldf1 = DF.drop_nil(ldf, [])

df = DF.collect(ldf1)
assert DF.to_columns(df) == %{"a" => [1, 2, nil], "b" => [1, nil, 3]}
end
end

describe "pivot_longer/3" do
test "without selecting columns", %{ldf: ldf} do
ldf = DF.pivot_longer(ldf, &String.ends_with?(&1, "fuel"), select: [])

assert ldf.names == ["variable", "value"]
assert ldf.dtypes == %{"variable" => :string, "value" => :integer}

df = DF.collect(ldf)
assert DF.shape(df) == {3282, 2}
assert ldf.names == df.names
assert ldf.dtypes == df.dtypes
end

test "selecting some columns", %{ldf: ldf} do
ldf = DF.pivot_longer(ldf, &String.ends_with?(&1, "fuel"), select: ["year", "country"])

assert ldf.names == ["year", "country", "variable", "value"]

assert ldf.dtypes == %{
"year" => :integer,
"country" => :string,
"variable" => :string,
"value" => :integer
}

df = DF.collect(ldf)

assert DF.shape(df) == {3282, 4}
assert ldf.names == df.names
assert ldf.dtypes == df.dtypes
end

test "selecting all the columns (not passing select option)", %{ldf: ldf} do
ldf = DF.pivot_longer(ldf, &String.ends_with?(&1, ["fuel", "fuels"]))

assert ldf.names == [
"year",
"country",
"total",
"cement",
"gas_flaring",
"per_capita",
"variable",
"value"
]

df = DF.collect(ldf)

assert DF.shape(df) == {4376, 8}
assert ldf.names == df.names
assert ldf.dtypes == df.dtypes
end

test "dropping some columns", %{ldf: ldf} do
ldf =
DF.pivot_longer(ldf, &String.ends_with?(&1, ["fuel", "fuels"]),
discard: ["gas_flaring", "cement"]
)

assert ldf.names == [
"year",
"country",
"total",
"per_capita",
"variable",
"value"
]

df = DF.collect(ldf)

assert ldf.names == df.names
assert ldf.dtypes == df.dtypes
end

test "select and discard with the same columns discards the columns", %{ldf: ldf} do
ldf =
DF.pivot_longer(ldf, &String.ends_with?(&1, ["fuel", "fuels"]),
select: ["gas_flaring", "cement"],
discard: fn name -> name == "cement" end
)

assert ldf.names == [
"gas_flaring",
"variable",
"value"
]

df = DF.collect(ldf)

assert ldf.names == df.names
assert ldf.dtypes == df.dtypes
end
end
end

0 comments on commit f9646a7

Please sign in to comment.