From 1dd8c4e0717f82fc3d5e7f71bb5153a002d27e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:37:22 +0100 Subject: [PATCH 01/11] Rewrite joins page. [skip ci] --- docs/source/_build/API_REFERENCE_LINKS.yml | 2 + .../user-guide/transformations/_joins.py | 162 ++++++++++ .../user-guide/transformations/joins.py | 210 ++++++------- .../rust/user-guide/transformations/_joins.rs | 243 +++++++++++++++ .../rust/user-guide/transformations/joins.rs | 292 ++++-------------- .../user-guide/transformations/joins.md | 271 ++++++++-------- 6 files changed, 705 insertions(+), 475 deletions(-) create mode 100644 docs/source/src/python/user-guide/transformations/_joins.py create mode 100644 docs/source/src/rust/user-guide/transformations/_joins.rs diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index 1fea34db3465..41eb9a53bd8f 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -102,6 +102,7 @@ python: name: execute link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.execute.html join_asof: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html + join_where: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html concat: https://docs.pola.rs/api/python/stable/reference/api/polars.concat.html pivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.pivot.html unpivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unpivot.html @@ -194,6 +195,7 @@ rust: unpivot: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample join_asof: 
https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof + join_where: https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest read_csv: diff --git a/docs/source/src/python/user-guide/transformations/_joins.py b/docs/source/src/python/user-guide/transformations/_joins.py new file mode 100644 index 000000000000..a34ea310e614 --- /dev/null +++ b/docs/source/src/python/user-guide/transformations/_joins.py @@ -0,0 +1,162 @@ +# --8<-- [start:setup] +import polars as pl +from datetime import datetime + +# --8<-- [end:setup] + +# --8<-- [start:innerdf] +df_customers = pl.DataFrame( + { + "customer_id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + } +) +print(df_customers) +# --8<-- [end:innerdf] + +# --8<-- [start:innerdf2] +df_orders = pl.DataFrame( + { + "order_id": ["a", "b", "c"], + "customer_id": [1, 2, 2], + "amount": [100, 200, 300], + } +) +print(df_orders) +# --8<-- [end:innerdf2] + + +# --8<-- [start:inner] +df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner") +print(df_inner_customer_join) +# --8<-- [end:inner] + +# --8<-- [start:left] +df_left_join = df_customers.join(df_orders, on="customer_id", how="left") +print(df_left_join) +# --8<-- [end:left] + +# --8<-- [start:right] +df_right_join = df_orders.join(df_customers, on="customer_id", how="right") +print(df_right_join) +# --8<-- [end:right] + +# --8<-- [start:full] +df_outer_join = df_customers.join(df_orders, on="customer_id", how="full") +print(df_outer_join) +# --8<-- [end:full] + +# --8<-- [start:full_coalesce] +df_outer_coalesce_join = df_customers.join( + df_orders, on="customer_id", how="full", coalesce=True +) +print(df_outer_coalesce_join) +# --8<-- [end:full_coalesce] + +# --8<-- [start:df3] +df_colors = pl.DataFrame( + { + "color": ["red", "blue", "green"], + } +) +print(df_colors) +# --8<-- [end:df3] + +# 
--8<-- [start:df4] +df_sizes = pl.DataFrame( + { + "size": ["S", "M", "L"], + } +) +print(df_sizes) +# --8<-- [end:df4] + +# --8<-- [start:cross] +df_cross_join = df_colors.join(df_sizes, how="cross") +print(df_cross_join) +# --8<-- [end:cross] + +# --8<-- [start:df5] +df_cars = pl.DataFrame( + { + "id": ["a", "b", "c"], + "make": ["ford", "toyota", "bmw"], + } +) +print(df_cars) +# --8<-- [end:df5] + +# --8<-- [start:df6] +df_repairs = pl.DataFrame( + { + "id": ["c", "c"], + "cost": [100, 200], + } +) +print(df_repairs) +# --8<-- [end:df6] + +# --8<-- [start:inner2] +df_inner_join = df_cars.join(df_repairs, on="id", how="inner") +print(df_inner_join) +# --8<-- [end:inner2] + +# --8<-- [start:semi] +df_semi_join = df_cars.join(df_repairs, on="id", how="semi") +print(df_semi_join) +# --8<-- [end:semi] + +# --8<-- [start:anti] +df_anti_join = df_cars.join(df_repairs, on="id", how="anti") +print(df_anti_join) +# --8<-- [end:anti] + +# --8<-- [start:df7] +df_trades = pl.DataFrame( + { + "time": [ + datetime(2020, 1, 1, 9, 1, 0), + datetime(2020, 1, 1, 9, 1, 0), + datetime(2020, 1, 1, 9, 3, 0), + datetime(2020, 1, 1, 9, 6, 0), + ], + "stock": ["A", "B", "B", "C"], + "trade": [101, 299, 301, 500], + } +) +print(df_trades) +# --8<-- [end:df7] + +# --8<-- [start:df8] +df_quotes = pl.DataFrame( + { + "time": [ + datetime(2020, 1, 1, 9, 0, 0), + datetime(2020, 1, 1, 9, 2, 0), + datetime(2020, 1, 1, 9, 4, 0), + datetime(2020, 1, 1, 9, 6, 0), + ], + "stock": ["A", "B", "C", "A"], + "quote": [100, 300, 501, 102], + } +) + +print(df_quotes) +# --8<-- [end:df8] + +# --8<-- [start:asofpre] +df_trades = df_trades.sort("time") +df_quotes = df_quotes.sort("time") # Set column as sorted +# --8<-- [end:asofpre] + +# --8<-- [start:asof] +df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock") +print(df_asof_join) +# --8<-- [end:asof] + +# --8<-- [start:asof2] +df_asof_tolerance_join = df_trades.join_asof( + df_quotes, on="time", by="stock", tolerance="1m" +) 
+print(df_asof_tolerance_join) +# --8<-- [end:asof2] diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py index a34ea310e614..fe967cf0bea0 100644 --- a/docs/source/src/python/user-guide/transformations/joins.py +++ b/docs/source/src/python/user-guide/transformations/joins.py @@ -1,117 +1,107 @@ -# --8<-- [start:setup] +# --8<-- [start:props_groups] import polars as pl -from datetime import datetime - -# --8<-- [end:setup] - -# --8<-- [start:innerdf] -df_customers = pl.DataFrame( - { - "customer_id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - } -) -print(df_customers) -# --8<-- [end:innerdf] - -# --8<-- [start:innerdf2] -df_orders = pl.DataFrame( - { - "order_id": ["a", "b", "c"], - "customer_id": [1, 2, 2], - "amount": [100, 200, 300], - } -) -print(df_orders) -# --8<-- [end:innerdf2] - -# --8<-- [start:inner] -df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner") -print(df_inner_customer_join) -# --8<-- [end:inner] +props_groups = pl.read_csv("docs/assets/data/monopoly_props_groups.csv").head(5) +print(props_groups) +# --8<-- [end:props_groups] -# --8<-- [start:left] -df_left_join = df_customers.join(df_orders, on="customer_id", how="left") -print(df_left_join) -# --8<-- [end:left] +# --8<-- [start:props_prices] +props_prices = pl.read_csv("docs/assets/data/monopoly_props_prices.csv").head(5) +print(props_prices) +# --8<-- [end:props_prices] -# --8<-- [start:right] -df_right_join = df_orders.join(df_customers, on="customer_id", how="right") -print(df_right_join) -# --8<-- [end:right] +# --8<-- [start:equi-join] +result = props_groups.join(props_prices, on="property_name") +print(result) +# --8<-- [end:equi-join] -# --8<-- [start:full] -df_outer_join = df_customers.join(df_orders, on="customer_id", how="full") -print(df_outer_join) -# --8<-- [end:full] +# --8<-- [start:props_groups2] +props_groups2 = 
props_groups.with_columns(pl.col("property_name").str.to_lowercase()) +print(props_groups2) +# --8<-- [end:props_groups2] -# --8<-- [start:full_coalesce] -df_outer_coalesce_join = df_customers.join( - df_orders, on="customer_id", how="full", coalesce=True -) -print(df_outer_coalesce_join) -# --8<-- [end:full_coalesce] - -# --8<-- [start:df3] -df_colors = pl.DataFrame( - { - "color": ["red", "blue", "green"], - } +# --8<-- [start:props_prices2] +props_prices2 = props_prices.select( + pl.col("property_name").alias("name"), pl.col("cost") ) -print(df_colors) -# --8<-- [end:df3] - -# --8<-- [start:df4] -df_sizes = pl.DataFrame( - { - "size": ["S", "M", "L"], - } +print(props_prices2) +# --8<-- [end:props_prices2] + +# --8<-- [start:join-key-expression] +result = props_groups2.join( + props_prices2, + left_on="property_name", + right_on=pl.col("name").str.to_lowercase(), ) -print(df_sizes) -# --8<-- [end:df4] - -# --8<-- [start:cross] -df_cross_join = df_colors.join(df_sizes, how="cross") -print(df_cross_join) -# --8<-- [end:cross] - -# --8<-- [start:df5] -df_cars = pl.DataFrame( - { - "id": ["a", "b", "c"], - "make": ["ford", "toyota", "bmw"], - } +print(result) +# --8<-- [end:join-key-expression] + +# --8<-- [start:inner-join] +result = props_groups.join(props_prices, on="property_name", how="inner") +print(result) +# --8<-- [end:inner-join] + +# --8<-- [start:left-join] +result = props_groups.join(props_prices, on="property_name", how="left") +print(result) +# --8<-- [end:left-join] + +# --8<-- [start:right-join] +result = props_groups.join(props_prices, on="property_name", how="right") +print(result) +# --8<-- [end:right-join] + +# --8<-- [start:left-right-join-equals] +print( + result.equals( + props_prices.join( + props_groups, + on="property_name", + how="left", + # Reorder the columns to match the order from above. 
+ ).select(pl.col("group"), pl.col("property_name"), pl.col("cost")) + ) ) -print(df_cars) -# --8<-- [end:df5] - -# --8<-- [start:df6] -df_repairs = pl.DataFrame( +# --8<-- [end:left-right-join-equals] + +# --8<-- [start:full-join] +result = props_groups.join(props_prices, on="property_name", how="full") +print(result) +# --8<-- [end:full-join] + +# --8<-- [start:full-join-coalesce] +result = props_groups.join(props_prices, on="property_name", how="full", coalesce=True) +print(result) +# --8<-- [end:full-join-coalesce] + +# --8<-- [start:semi-join] +result = props_groups.join(props_prices, on="property_name", how="semi") +print(result) +# --8<-- [end:semi-join] + +# --8<-- [start:anti-join] +result = props_groups.join(props_prices, on="property_name", how="anti") +print(result) +# --8<-- [end:anti-join] + +# --8<-- [start:players] +players = pl.DataFrame( { - "id": ["c", "c"], - "cost": [100, 200], + "name": ["Alice", "Bob"], + "cash": [78, 135], } ) -print(df_repairs) -# --8<-- [end:df6] +print(players) +# --8<-- [end:players] -# --8<-- [start:inner2] -df_inner_join = df_cars.join(df_repairs, on="id", how="inner") -print(df_inner_join) -# --8<-- [end:inner2] +# --8<-- [start:non-equi] +result = players.join_where(props_prices, pl.col("cash") > pl.col("cost")) +print(result) +# --8<-- [end:non-equi] -# --8<-- [start:semi] -df_semi_join = df_cars.join(df_repairs, on="id", how="semi") -print(df_semi_join) -# --8<-- [end:semi] - -# --8<-- [start:anti] -df_anti_join = df_cars.join(df_repairs, on="id", how="anti") -print(df_anti_join) -# --8<-- [end:anti] +# --8<-- [start:df_trades] +from datetime import datetime -# --8<-- [start:df7] df_trades = pl.DataFrame( { "time": [ @@ -125,9 +115,9 @@ } ) print(df_trades) -# --8<-- [end:df7] +# --8<-- [end:df_trades] -# --8<-- [start:df8] +# --8<-- [start:df_quotes] df_quotes = pl.DataFrame( { "time": [ @@ -142,21 +132,23 @@ ) print(df_quotes) -# --8<-- [end:df8] - -# --8<-- [start:asofpre] -df_trades = df_trades.sort("time") 
-df_quotes = df_quotes.sort("time") # Set column as sorted -# --8<-- [end:asofpre] +# --8<-- [end:df_quotes] # --8<-- [start:asof] df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock") print(df_asof_join) # --8<-- [end:asof] -# --8<-- [start:asof2] +# --8<-- [start:asof-tolerance] df_asof_tolerance_join = df_trades.join_asof( df_quotes, on="time", by="stock", tolerance="1m" ) print(df_asof_tolerance_join) -# --8<-- [end:asof2] +# --8<-- [end:asof-tolerance] + +# --8<-- [start:cartesian-product] +tokens = pl.DataFrame({"monopoly_token": ["hat", "shoe", "boat"]}) + +result = players.select(pl.col("name")).join(tokens, how="cross") +print(result) +# --8<-- [end:cartesian-product] diff --git a/docs/source/src/rust/user-guide/transformations/_joins.rs b/docs/source/src/rust/user-guide/transformations/_joins.rs new file mode 100644 index 000000000000..5caa0cc4ac18 --- /dev/null +++ b/docs/source/src/rust/user-guide/transformations/_joins.rs @@ -0,0 +1,243 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:innerdf] + let df_customers = df! 
( + + "customer_id" => &[1, 2, 3], + "name" => &["Alice", "Bob", "Charlie"], + )?; + + println!("{}", &df_customers); + // --8<-- [end:innerdf] + + // --8<-- [start:innerdf2] + let df_orders = df!( + "order_id"=> &["a", "b", "c"], + "customer_id"=> &[1, 2, 2], + "amount"=> &[100, 200, 300], + )?; + println!("{}", &df_orders); + // --8<-- [end:innerdf2] + + // --8<-- [start:inner] + let df_inner_customer_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Inner), + ) + .collect()?; + println!("{}", &df_inner_customer_join); + // --8<-- [end:inner] + + // --8<-- [start:left] + let df_left_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Left), + ) + .collect()?; + println!("{}", &df_left_join); + // --8<-- [end:left] + + // --8<-- [start:right] + let df_right_join = df_orders + .clone() + .lazy() + .join( + df_customers.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Right), + ) + .collect()?; + println!("{}", &df_right_join); + // --8<-- [end:right] + + // --8<-- [start:full] + let df_full_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Full), + ) + .collect()?; + println!("{}", &df_full_join); + // --8<-- [end:full] + + // --8<-- [start:full_coalesce] + let df_full_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), + ) + .collect()?; + println!("{}", &df_full_join); + // --8<-- [end:full_coalesce] + + // --8<-- [start:df3] + let df_colors = df!( + "color"=> &["red", "blue", "green"], + )?; + println!("{}", &df_colors); + // --8<-- [end:df3] + + // --8<-- [start:df4] + let 
df_sizes = df!( + "size"=> &["S", "M", "L"], + )?; + println!("{}", &df_sizes); + // --8<-- [end:df4] + + // --8<-- [start:cross] + let df_cross_join = df_colors + .clone() + .lazy() + .cross_join(df_sizes.clone().lazy(), None) + .collect()?; + println!("{}", &df_cross_join); + // --8<-- [end:cross] + + // --8<-- [start:df5] + let df_cars = df!( + "id"=> &["a", "b", "c"], + "make"=> &["ford", "toyota", "bmw"], + )?; + println!("{}", &df_cars); + // --8<-- [end:df5] + + // --8<-- [start:df6] + let df_repairs = df!( + "id"=> &["c", "c"], + "cost"=> &[100, 200], + )?; + println!("{}", &df_repairs); + // --8<-- [end:df6] + + // --8<-- [start:inner2] + let df_inner_join = df_cars + .clone() + .lazy() + .inner_join(df_repairs.clone().lazy(), col("id"), col("id")) + .collect()?; + println!("{}", &df_inner_join); + // --8<-- [end:inner2] + + // --8<-- [start:semi] + let df_semi_join = df_cars + .clone() + .lazy() + .join( + df_repairs.clone().lazy(), + [col("id")], + [col("id")], + JoinArgs::new(JoinType::Semi), + ) + .collect()?; + println!("{}", &df_semi_join); + // --8<-- [end:semi] + + // --8<-- [start:anti] + let df_anti_join = df_cars + .clone() + .lazy() + .join( + df_repairs.clone().lazy(), + [col("id")], + [col("id")], + JoinArgs::new(JoinType::Anti), + ) + .collect()?; + println!("{}", &df_anti_join); + // --8<-- [end:anti] + + // --8<-- [start:df7] + use chrono::prelude::*; + let df_trades = df!( + "time"=> &[ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock"=> &["A", "B", "B", "C"], + "trade"=> &[101, 299, 301, 500], + )?; + println!("{}", &df_trades); + // --8<-- [end:df7] + + // --8<-- [start:df8] + let df_quotes = df!( + "time"=> &[ + NaiveDate::from_ymd_opt(2020, 1, 
1).unwrap().and_hms_opt(9, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock"=> &["A", "B", "C", "A"], + "quote"=> &[100, 300, 501, 102], + )?; + + println!("{}", &df_quotes); + // --8<-- [end:df8] + + // --8<-- [start:asofpre] + let df_trades = df_trades + .sort( + ["time"], + SortMultipleOptions::default().with_maintain_order(true), + ) + .unwrap(); + let df_quotes = df_quotes + .sort( + ["time"], + SortMultipleOptions::default().with_maintain_order(true), + ) + .unwrap(); + // --8<-- [end:asofpre] + + // --8<-- [start:asof] + let df_asof_join = df_trades.join_asof_by( + &df_quotes, + "time", + "time", + ["stock"], + ["stock"], + AsofStrategy::Backward, + None, + )?; + println!("{}", &df_asof_join); + // --8<-- [end:asof] + + // --8<-- [start:asof2] + let df_asof_tolerance_join = df_trades.join_asof_by( + &df_quotes, + "time", + "time", + ["stock"], + ["stock"], + AsofStrategy::Backward, + Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), + )?; + println!("{}", &df_asof_tolerance_join); + // --8<-- [end:asof2] + + Ok(()) +} diff --git a/docs/source/src/rust/user-guide/transformations/joins.rs b/docs/source/src/rust/user-guide/transformations/joins.rs index 5caa0cc4ac18..557bfb2f7bd6 100644 --- a/docs/source/src/rust/user-guide/transformations/joins.rs +++ b/docs/source/src/rust/user-guide/transformations/joins.rs @@ -3,241 +3,67 @@ use polars::prelude::*; // --8<-- [end:setup] fn main() -> Result<(), Box> { - // --8<-- [start:innerdf] - let df_customers = df! 
( - - "customer_id" => &[1, 2, 3], - "name" => &["Alice", "Bob", "Charlie"], - )?; - - println!("{}", &df_customers); - // --8<-- [end:innerdf] - - // --8<-- [start:innerdf2] - let df_orders = df!( - "order_id"=> &["a", "b", "c"], - "customer_id"=> &[1, 2, 2], - "amount"=> &[100, 200, 300], - )?; - println!("{}", &df_orders); - // --8<-- [end:innerdf2] - - // --8<-- [start:inner] - let df_inner_customer_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Inner), - ) - .collect()?; - println!("{}", &df_inner_customer_join); - // --8<-- [end:inner] - - // --8<-- [start:left] - let df_left_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Left), - ) - .collect()?; - println!("{}", &df_left_join); - // --8<-- [end:left] - - // --8<-- [start:right] - let df_right_join = df_orders - .clone() - .lazy() - .join( - df_customers.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Right), - ) - .collect()?; - println!("{}", &df_right_join); - // --8<-- [end:right] - - // --8<-- [start:full] - let df_full_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Full), - ) - .collect()?; - println!("{}", &df_full_join); - // --8<-- [end:full] - - // --8<-- [start:full_coalesce] - let df_full_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), - ) - .collect()?; - println!("{}", &df_full_join); - // --8<-- [end:full_coalesce] - - // --8<-- [start:df3] - let df_colors = df!( - "color"=> &["red", "blue", "green"], - )?; - println!("{}", &df_colors); - // --8<-- [end:df3] - - // --8<-- [start:df4] - let 
df_sizes = df!( - "size"=> &["S", "M", "L"], - )?; - println!("{}", &df_sizes); - // --8<-- [end:df4] - - // --8<-- [start:cross] - let df_cross_join = df_colors - .clone() - .lazy() - .cross_join(df_sizes.clone().lazy(), None) - .collect()?; - println!("{}", &df_cross_join); - // --8<-- [end:cross] - - // --8<-- [start:df5] - let df_cars = df!( - "id"=> &["a", "b", "c"], - "make"=> &["ford", "toyota", "bmw"], - )?; - println!("{}", &df_cars); - // --8<-- [end:df5] - - // --8<-- [start:df6] - let df_repairs = df!( - "id"=> &["c", "c"], - "cost"=> &[100, 200], - )?; - println!("{}", &df_repairs); - // --8<-- [end:df6] - - // --8<-- [start:inner2] - let df_inner_join = df_cars - .clone() - .lazy() - .inner_join(df_repairs.clone().lazy(), col("id"), col("id")) - .collect()?; - println!("{}", &df_inner_join); - // --8<-- [end:inner2] - - // --8<-- [start:semi] - let df_semi_join = df_cars - .clone() - .lazy() - .join( - df_repairs.clone().lazy(), - [col("id")], - [col("id")], - JoinArgs::new(JoinType::Semi), - ) - .collect()?; - println!("{}", &df_semi_join); - // --8<-- [end:semi] - - // --8<-- [start:anti] - let df_anti_join = df_cars - .clone() - .lazy() - .join( - df_repairs.clone().lazy(), - [col("id")], - [col("id")], - JoinArgs::new(JoinType::Anti), - ) - .collect()?; - println!("{}", &df_anti_join); - // --8<-- [end:anti] - - // --8<-- [start:df7] - use chrono::prelude::*; - let df_trades = df!( - "time"=> &[ - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), - ], - "stock"=> &["A", "B", "B", "C"], - "trade"=> &[101, 299, 301, 500], - )?; - println!("{}", &df_trades); - // --8<-- [end:df7] - - // --8<-- [start:df8] - let df_quotes = df!( - "time"=> &[ - NaiveDate::from_ymd_opt(2020, 1, 
1).unwrap().and_hms_opt(9, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), - ], - "stock"=> &["A", "B", "C", "A"], - "quote"=> &[100, 300, 501, 102], - )?; - - println!("{}", &df_quotes); - // --8<-- [end:df8] - - // --8<-- [start:asofpre] - let df_trades = df_trades - .sort( - ["time"], - SortMultipleOptions::default().with_maintain_order(true), - ) - .unwrap(); - let df_quotes = df_quotes - .sort( - ["time"], - SortMultipleOptions::default().with_maintain_order(true), - ) - .unwrap(); - // --8<-- [end:asofpre] + // --8<-- [start:props_groups] + // --8<-- [end:props_groups] + + // --8<-- [start:props_prices] + // --8<-- [end:props_prices] + + // --8<-- [start:equi-join] + // --8<-- [end:equi-join] + + // --8<-- [start:props_groups2] + // --8<-- [end:props_groups2] + + // --8<-- [start:props_prices2] + // --8<-- [end:props_prices2] + + // --8<-- [start:join-key-expression] + // --8<-- [end:join-key-expression] + + // --8<-- [start:inner-join] + // --8<-- [end:inner-join] + + // --8<-- [start:left-join] + // --8<-- [end:left-join] + + // --8<-- [start:right-join] + // --8<-- [end:right-join] + + // --8<-- [start:left-right-join-equals] + // --8<-- [end:left-right-join-equals] + + // --8<-- [start:full-join] + // --8<-- [end:full-join] + + // --8<-- [start:full-join-coalesce] + // --8<-- [end:full-join-coalesce] + + // --8<-- [start:semi-join] + // --8<-- [end:semi-join] + + // --8<-- [start:anti-join] + // --8<-- [end:anti-join] + + // --8<-- [start:players] + // --8<-- [end:players] + + // --8<-- [start:non-equi] + // --8<-- [end:non-equi] + + // --8<-- [start:df_trades] + // --8<-- [end:df_trades] + + // --8<-- [start:df_quotes] + // --8<-- [end:df_quotes] // --8<-- [start:asof] - let df_asof_join = df_trades.join_asof_by( - &df_quotes, - "time", - "time", - 
["stock"], - ["stock"], - AsofStrategy::Backward, - None, - )?; - println!("{}", &df_asof_join); - // --8<-- [end:asof] - - // --8<-- [start:asof2] - let df_asof_tolerance_join = df_trades.join_asof_by( - &df_quotes, - "time", - "time", - ["stock"], - ["stock"], - AsofStrategy::Backward, - Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), - )?; - println!("{}", &df_asof_tolerance_join); - // --8<-- [end:asof2] + + // --8<-- [start:asof-tolerance] + // --8<-- [end:asof-tolerance] + + // --8<-- [start:cartesian-product] + // --8<-- [end:cartesian-product] Ok(()) } diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index 7cf07e680503..b3dd750d950f 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -1,229 +1,234 @@ # Joins -## Join strategies - -Polars supports the following join strategies by specifying the `how` argument: +A join is a dataframe operation in which the rows of two dataframes are concatenated horizontally according to a “joining strategy” and matching criteria. -| Strategy | Description | -| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `inner` | Returns row with matching keys in _both_ frames. Non-matching rows in either the left or right frame are discarded. | -| `left` | Returns all rows in the left dataframe, whether or not a match in the right-frame is found. Non-matching rows have their right columns null-filled. | -| `right` | Returns all rows in the right dataframe, whether or not a match in the left-frame is found. Non-matching rows have their left columns null-filled. | -| `full` | Returns all rows from both the left and right dataframe. If no match is found in one frame, columns from the other frame are null-filled. 
| -| `cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicates rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. | -| `semi` | Returns all rows from the left frame in which the join key is also present in the right frame. | -| `anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. | +The most common type of join is an “equi join”, in which rows are matched by a key expression. +Polars supports several joining strategies for equi joins, which determine exactly how we handle the matching of rows. +Polars also supports “non-equi joins”, a type of join where the matching criterion is not an equality, and a type of join where rows are matched by key proximity, called “asof join”. -A separate `coalesce` parameter determines whether to merge key columns with the same name from the left and right -frames. +We show examples of all these types of joins below. +For that, we will be loading some (modified) Monopoly property data: -### Inner join +{{code_block('user-guide/transformations/joins','props_groups',[])}} -An `inner` join produces a `DataFrame` that contains only the rows where the join key exists in both `DataFrames`. Let's -take for example the following two `DataFrames`: +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_groups" +``` -{{code_block('user-guide/transformations/joins','innerdf',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','props_prices',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:setup" ---8<-- "python/user-guide/transformations/joins.py:innerdf" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_prices" ``` -

+## Equi joins + +In an equi join, rows are matched by checking equality of a key expression. +You can do an equi join with the function `join` by specifying the name of the column to be used as key: -{{code_block('user-guide/transformations/joins','innerdf2',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','equi-join',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:innerdf2" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:equi-join" ``` -To get a `DataFrame` with the orders and their associated customer we can do an `inner` join on the `customer_id` -column: +The result has four rows but both dataframes used in the operation had five rows. +Polars uses a joining strategy to determine what happens with rows that have multiple matches or with rows that have no match at all. +By default, Polars computes an “inner join” but there are [other join strategies that we show next](#join-strategies). -{{code_block('user-guide/transformations/joins','inner',['join'])}} +In the example above, the two dataframes conveniently had the column we wish to use as key with the same name and with the values in the exact same format. 
+Suppose, for the sake of argument, that one of the dataframes had a differently named column and the other had the property names in lower case: -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:inner" +{{code_block('user-guide/transformations/joins','props_groups2',[])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_groups2" ``` -### Left join +{{code_block('user-guide/transformations/joins','props_prices2',[])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_prices2" +``` -The `left` outer join produces a `DataFrame` that contains all the rows from the left `DataFrame` and only the rows from -the right `DataFrame` where the join key exists in the left `DataFrame`. If we now take the example from above and want -to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an -order or not) we can do a `left` join: +In a situation like this, where we may want to perform the same join as before, we can leverage `join`'s flexibility and specify arbitrary expressions to compute the joining key on the left and on the right, allowing one to compute row keys dynamically: -{{code_block('user-guide/transformations/joins','left',['join'])}} +{{code_block('user-guide/transformations/joins','join-key-expression',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:left" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:join-key-expression" ``` -Notice, that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this -customer. 
+Because we are joining on the right with an expression, Polars preserves the column “property_name” from the left and the column “name” from the right so we can have access to the original values that the key expressions were applied to. -### Right join +## Join strategies -The `right` outer join produces a `DataFrame` that contains all the rows from the right `DataFrame` and only the rows from -the left `DataFrame` where the join key exists in the right `DataFrame`. If we now take the example from above and want -to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an -order or not) we can do a `right` join: +When computing a join with `df1.join(df2, ...)`, we can specify one of many different join strategies. +A join strategy specifies what rows to keep from each dataframe based on whether they match rows from the other dataframe. -{{code_block('user-guide/transformations/joins','right',['join'])}} +### Inner join -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:right" -``` +In an inner join the resulting dataframe only contains the rows from the left and right dataframes that matched. +That is the default strategy used by `join` and above we can see an example of that. +We repeat the example here and explicitly specify the join strategy: -Notice, that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this -customer. +{{code_block('user-guide/transformations/joins','inner-join',['join'])}} -### Outer join +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:inner-join" +``` -The `full` outer join produces a `DataFrame` that contains all the rows from both `DataFrames`. Columns are null, if the -join key does not exist in the source `DataFrame`. 
Doing a `full` outer join on the two `DataFrames` from above produces -a similar `DataFrame` to the `left` join: +The result does not include the row from `props_groups` that contains “The Shire” and the result also does not include the row from `props_prices` that contains “Sesame Street”. -{{code_block('user-guide/transformations/joins','full',['join'])}} +### Left join -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:full" -``` +A left outer join is a join where the result contains all the rows from the left dataframe and the rows of the right dataframe that matched any rows from the left dataframe. -{{code_block('user-guide/transformations/joins','full_coalesce',['join'])}} +{{code_block('user-guide/transformations/joins','left-join',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:full_coalesce" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:left-join" ``` -### Cross join +If there are any rows from the left dataframe that have no matching rows on the right dataframe, they get the value `null` on the new columns. -A `cross` join is a Cartesian product of the two `DataFrames`. This means that every row in the left `DataFrame` is -joined with every row in the right `DataFrame`. The `cross` join is useful for creating a `DataFrame` with all possible -combinations of the columns in two `DataFrames`. Let's take for example the following two `DataFrames`. +### Right join -{{code_block('user-guide/transformations/joins','df3',['DataFrame'])}} +Computationally speaking, a right outer join is exactly the same as a left outer join, but with the arguments swapped. 
+Here is an example: -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df3" +{{code_block('user-guide/transformations/joins','right-join',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:right-join" ``` -

+We show that `df1.join(df2, how="right", ...)` is the same as `df2.join(df1, how="left", ...)`, up to the order of the columns of the result, with the computation below: -{{code_block('user-guide/transformations/joins','df4',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','left-right-join-equals',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df4" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:left-right-join-equals" ``` -We can now create a `DataFrame` containing all possible combinations of the colors and sizes with a `cross` join: +### Full join + +A full outer join will keep all of the rows from the left and right dataframes, even if they don't have matching rows in the other dataframe: -{{code_block('user-guide/transformations/joins','cross',['join'])}} +{{code_block('user-guide/transformations/joins','full-join',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:cross" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:full-join" ``` -
+In this case, we see that we get two columns `property_name` and `property_name_right` to make up for the fact that we are matching on the column `property_name` of both dataframes and there are some names for which there are no matches. +The two columns help differentiate the source of each row's data. +If we wanted to force `join` to coalesce the two columns `property_name` into a single column, we could set `coalesce=True` explicitly: + +{{code_block('user-guide/transformations/joins','full-join-coalesce',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:full-join-coalesce" +``` -The `inner`, `left`, `right`, `full` and `cross` join strategies are standard amongst dataframe libraries. We provide more -details on the less familiar `semi`, `anti` and `asof` join strategies below. +When not set, the default value of the parameter `coalesce` depends on the join strategy, which is why the inner, left, and right joins act as if `coalesce=True`, even though we didn't set it. ### Semi join -The `semi` join returns all rows from the left frame in which the join key is also present in the right frame. Consider -the following scenario: a car rental company has a `DataFrame` showing the cars that it owns with each car having a -unique `id`. +A semi join will return the rows of the left dataframe that have a match in the right dataframe, but we do not actually join the matching rows: -{{code_block('user-guide/transformations/joins','df5',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','semi-join',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df5" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:semi-join" ``` -The company has another `DataFrame` showing each repair job carried out on a vehicle.
+A semi join acts as a sort of row filter based on a second dataframe. -{{code_block('user-guide/transformations/joins','df6',['DataFrame'])}} +### Anti join -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df6" -``` +Conversely, an anti join will return the rows of the left dataframe that do not have a match in the right dataframe: -You want to answer this question: which of the cars have had repairs carried out? +{{code_block('user-guide/transformations/joins','anti-join',['join'])}} -An inner join does not answer this question directly as it produces a `DataFrame` with multiple rows for each car that -has had multiple repair jobs: +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:anti-join" +``` -{{code_block('user-guide/transformations/joins','inner2',['join'])}} +## Non-equi joins -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:inner2" -``` +In a non-equi join matches between the left and right dataframes are computed differently. +Instead of looking for matches on key expressions, we provide a single predicate that determines what rows of the left dataframe can be paired up with what rows of the right dataframe. -However, a semi join produces a single row for each car that has had a repair job carried out. 
+For example, consider the following Monopoly players and their current cash: -{{code_block('user-guide/transformations/joins','semi',['join'])}} +{{code_block('user-guide/transformations/joins','players',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:semi" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:players" ``` -### Anti join - -Continuing this example, an alternative question might be: which of the cars have **not** had a repair job carried out? -An anti join produces a `DataFrame` showing all the cars from `df_cars` where the `id` is not present in -the `df_repairs` `DataFrame`. +Using a non-equi join we can easily build a dataframe with all the possible properties that each player could be interested in buying. +We use the function `join_where` to compute a non-equi join: -{{code_block('user-guide/transformations/joins','anti',['join'])}} +{{code_block('user-guide/transformations/joins','non-equi',['join_where'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:anti" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:non-equi" ``` +You can provide multiple expressions as predicates but they all must use Boolean comparison operators and must refer to columns from both dataframes. + +!!! note +`join_where` is still experimental and doesn't yet support arbitrary Boolean expressions as predicates. + ## Asof join An `asof` join is like a left join except that we match on nearest key rather than equal keys. In Polars we can do an asof join with the `join_asof` method. -Consider the following scenario: a stock market broker has a `DataFrame` called `df_trades` showing transactions it has -made for different stocks. 
+For the asof join we will consider a scenario inspired by the stock market. +Suppose a stock market broker has a dataframe called `df_trades` showing transactions it has made for different stocks. -{{code_block('user-guide/transformations/joins','df7',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','df_trades',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df7" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df_trades" ``` -The broker has another `DataFrame` called `df_quotes` showing prices it has quoted for these stocks. +The broker has another dataframe called `df_quotes` showing prices it has quoted for these stocks: -{{code_block('user-guide/transformations/joins','df8',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','df_quotes',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df8" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df_quotes" ``` -You want to produce a `DataFrame` showing for each trade the most recent quote provided _before_ the trade. You do this -with `join_asof` (using the default `strategy = "backward"`). -To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the -stock column with `by="stock"`. +You want to produce a dataframe showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`). +To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. 
{{code_block('user-guide/transformations/joins','asof',['join_asof'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:asofpre" +```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asof" ``` -If you want to make sure that only quotes within a certain time range are joined to the trades you can specify -the `tolerance` argument. In this case we want to make sure that the last preceding quote is within 1 minute of the -trade so we set `tolerance = "1m"`. +If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. +In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`. -=== ":fontawesome-brands-python: Python" +{{code_block('user-guide/transformations/joins','asof-tolerance',['join_asof'])}} -```python ---8<-- "python/user-guide/transformations/joins.py:asof2" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:asof-tolerance" ``` -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:asof2" +## Cartesian product + +Polars allows you to compute the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of two dataframes, producing a dataframe where all rows of the left dataframe are paired up with all the rows of the right dataframe. 
+To compute the Cartesian product of two dataframes, you can pass the strategy `how="cross"` to the function `join` without specifying any of `on`, `left_on`, and `right_on`: + +{{code_block('user-guide/transformations/joins','cartesian-product',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:cartesian-product" ``` From 1130e56264028ebd0afea7e19e511bcda342570b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Tue, 8 Oct 2024 12:52:34 +0100 Subject: [PATCH 02/11] Add quick ref table to joins. [skip ci] --- .../user-guide/transformations/joins.md | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index b3dd750d950f..75982facb9fb 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -6,8 +6,43 @@ The most common type of join is an “equi join”, in which rows are matched by Polars supports several joining strategies for equi joins, which determine exactly how we handle the matching of rows. Polars also supports “non-equi joins”, a type of join where the matching criterion is not an equality, and a type of join where rows are matched by key proximity, called “asof join”. -We show examples of all these types of joins below. -For that, we will be loading some (modified) Monopoly property data: +## Quick reference table + +The table below acts as a quick reference for people who know what they are looking for. +If you want to learn about joins in general and how to work with them in Polars, feel free to skip the table and keep reading below. 
+ +=== ":fontawesome-brands-python: Python" +[:material-api: `join`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html) +[:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) +[:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) + +=== ":fontawesome-brands-rust: Rust" +[:material-api: `join`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join) +[:material-api: `join_asof`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof) +[:material-flag-plus: Available on feature polars-ops](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag polars-ops"){.feature-flag} + + [:material-api: `join_where`](https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where) + [:material-flag-plus: Available on feature lazy](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag lazy"){.feature-flag} + +| Type | Function | Brief description | +| --------------------- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Equi inner join | `join(..., how="inner")` | Keeps rows that matched both on the left and right. | +| Equi left outer join | `join(..., how="left")` | Keeps all rows from the left plus matching rows from the right. Non-matching rows from the left have their right columns filled with `null`. | +| Equi right outer join | `join(..., how="right")` | Keeps all rows from the right plus matching rows from the left. Non-matching rows from the right have their left columns filled with `null`.
| +| Equi full join | `join(..., how="full")` | Keeps all rows from either dataframe, regardless of whether they match or not. Non-matching rows from one side have the columns from the other side filled with `null`. | +| Equi semi join | `join(..., how="semi")` | Keeps rows from the left that have a match on the right. | +| Equi anti join | `join(..., how="anti")` | Keeps rows from the left that do not have a match on the right. | +| Non-equi inner join | `join_where` | Finds all possible pairings of rows from the left and right that satisfy the given predicate(s). | +| Asof join | `join_asof` | Like a left outer join, but matches on the nearest key instead of on exact key matches. | +| Cartesian product | `join(..., how="cross")` | Computes the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of the two dataframes. | + +## Equi joins + +In an equi join, rows are matched by checking equality of a key expression. +You can do an equi join with the function `join` by specifying the name of the column to be used as key. +For the examples, we will be loading some (modified) Monopoly property data. + +First, we load a dataframe that contains property names and their colour group in the game: {{code_block('user-guide/transformations/joins','props_groups',[])}} @@ -15,16 +50,15 @@ For that, we will be loading some (modified) Monopoly property data: --8<-- "python/user-guide/transformations/joins.py:props_groups" ``` +Next, we load a dataframe that contains property names and their price in the game: + {{code_block('user-guide/transformations/joins','props_prices',[])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:props_prices" ``` -## Equi joins - -In an equi join, rows are matched by checking equality of a key expression. 
-You can do an equi join with the function `join` by specifying the name of the column to be used as key: +Now, we join both dataframes to create a dataframe that contains property names, colour groups, and prices: {{code_block('user-guide/transformations/joins','equi-join',['join'])}} From d338c7b78b08caa7f64bb2e7c09bd84448523e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Tue, 8 Oct 2024 13:36:30 +0100 Subject: [PATCH 03/11] Add missing data. [skip ci] --- docs/assets/data/monopoly.csv | 29 +++++++++++++++++++++ docs/assets/data/monopoly_props_groups.csv | 30 ++++++++++++++++++++++ docs/assets/data/monopoly_props_prices.csv | 30 ++++++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 docs/assets/data/monopoly.csv create mode 100644 docs/assets/data/monopoly_props_groups.csv create mode 100644 docs/assets/data/monopoly_props_prices.csv diff --git a/docs/assets/data/monopoly.csv b/docs/assets/data/monopoly.csv new file mode 100644 index 000000000000..055b5e8f6c71 --- /dev/null +++ b/docs/assets/data/monopoly.csv @@ -0,0 +1,29 @@ +property_name,cost,group +Old Ken Road,60,brown +Whitechapel Road,60,brown +Kings Cross Station,200,stations +"The Angel, Islington",100,light_blue +Euston Road,100,light_blue +Pentonville Road,120,light_blue +Pall Mall,140,pink +Electric Company,150,utilities +Whitehall,140,pink +Northumberland Avenue,160,pink +Marylebone Station,200,stations +Bow Street,180,orange +Marlborough Street,180,orange +Vine Street,200,orange +Strand,220,red +Fleet Street,220,red +Trafalgar Square,240,red +Fenchurch St Station,200,stations +Leicester Square,260,yellow +Coventry Street,260,yellow +Water Works,150,utilities +Piccadilly,280,yellow +Regent Street,300,green +Oxford Street,300,green +Bond Street,320,green +Liverpool Street Station,200,stations +Park Lane,350,dark_blue +Mayfair,400,dark_blue diff --git 
a/docs/assets/data/monopoly_props_groups.csv b/docs/assets/data/monopoly_props_groups.csv new file mode 100644 index 000000000000..1dc6088bd0cc --- /dev/null +++ b/docs/assets/data/monopoly_props_groups.csv @@ -0,0 +1,30 @@ +property_name,group +Old Ken Road,brown +Whitechapel Road,brown +The Shire,fantasy +Kings Cross Station,stations +"The Angel, Islington",light_blue +Euston Road,light_blue +Pentonville Road,light_blue +Pall Mall,pink +Electric Company,utilities +Whitehall,pink +Northumberland Avenue,pink +Marylebone Station,stations +Bow Street,orange +Marlborough Street,orange +Vine Street,orange +Strand,red +Fleet Street,red +Trafalgar Square,red +Fenchurch St Station,stations +Leicester Square,yellow +Coventry Street,yellow +Water Works,utilities +Piccadilly,yellow +Regent Street,green +Oxford Street,green +Bond Street,green +Liverpool Street Station,stations +Park Lane,dark_blue +Mayfair,dark_blue diff --git a/docs/assets/data/monopoly_props_prices.csv b/docs/assets/data/monopoly_props_prices.csv new file mode 100644 index 000000000000..0116cc5c1525 --- /dev/null +++ b/docs/assets/data/monopoly_props_prices.csv @@ -0,0 +1,30 @@ +property_name,cost +Old Ken Road,60 +Whitechapel Road,60 +Sesame Street,100 +Kings Cross Station,200 +"The Angel, Islington",100 +Euston Road,100 +Pentonville Road,120 +Pall Mall,140 +Electric Company,150 +Whitehall,140 +Northumberland Avenue,160 +Marylebone Station,200 +Bow Street,180 +Marlborough Street,180 +Vine Street,200 +Strand,220 +Fleet Street,220 +Trafalgar Square,240 +Fenchurch St Station,200 +Leicester Square,260 +Coventry Street,260 +Water Works,150 +Piccadilly,280 +Regent Street,300 +Oxford Street,300 +Bond Street,320 +Liverpool Street Station,200 +Park Lane,350 +Mayfair,400 From b9a0b95d2a4204f42159df0bdbf58e0f2f9739ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 9 Oct 2024 09:03:15 +0100 Subject: [PATCH 04/11] Address 
review feedback. [skip ci] --- docs/assets/data/monopoly.csv | 29 ------------------ docs/assets/data/monopoly_props_groups.csv | 30 ------------------- docs/assets/data/monopoly_props_prices.csv | 30 ------------------- .../user-guide/transformations/joins.py | 8 +++-- .../user-guide/transformations/joins.md | 5 ++-- 5 files changed, 9 insertions(+), 93 deletions(-) delete mode 100644 docs/assets/data/monopoly.csv delete mode 100644 docs/assets/data/monopoly_props_groups.csv delete mode 100644 docs/assets/data/monopoly_props_prices.csv diff --git a/docs/assets/data/monopoly.csv b/docs/assets/data/monopoly.csv deleted file mode 100644 index 055b5e8f6c71..000000000000 --- a/docs/assets/data/monopoly.csv +++ /dev/null @@ -1,29 +0,0 @@ -property_name,cost,group -Old Ken Road,60,brown -Whitechapel Road,60,brown -Kings Cross Station,200,stations -"The Angel, Islington",100,light_blue -Euston Road,100,light_blue -Pentonville Road,120,light_blue -Pall Mall,140,pink -Electric Company,150,utilities -Whitehall,140,pink -Northumberland Avenue,160,pink -Marylebone Station,200,stations -Bow Street,180,orange -Marlborough Street,180,orange -Vine Street,200,orange -Strand,220,red -Fleet Street,220,red -Trafalgar Square,240,red -Fenchurch St Station,200,stations -Leicester Square,260,yellow -Coventry Street,260,yellow -Water Works,150,utilities -Piccadilly,280,yellow -Regent Street,300,green -Oxford Street,300,green -Bond Street,320,green -Liverpool Street Station,200,stations -Park Lane,350,dark_blue -Mayfair,400,dark_blue diff --git a/docs/assets/data/monopoly_props_groups.csv b/docs/assets/data/monopoly_props_groups.csv deleted file mode 100644 index 1dc6088bd0cc..000000000000 --- a/docs/assets/data/monopoly_props_groups.csv +++ /dev/null @@ -1,30 +0,0 @@ -property_name,group -Old Ken Road,brown -Whitechapel Road,brown -The Shire,fantasy -Kings Cross Station,stations -"The Angel, Islington",light_blue -Euston Road,light_blue -Pentonville Road,light_blue -Pall Mall,pink 
-Electric Company,utilities -Whitehall,pink -Northumberland Avenue,pink -Marylebone Station,stations -Bow Street,orange -Marlborough Street,orange -Vine Street,orange -Strand,red -Fleet Street,red -Trafalgar Square,red -Fenchurch St Station,stations -Leicester Square,yellow -Coventry Street,yellow -Water Works,utilities -Piccadilly,yellow -Regent Street,green -Oxford Street,green -Bond Street,green -Liverpool Street Station,stations -Park Lane,dark_blue -Mayfair,dark_blue diff --git a/docs/assets/data/monopoly_props_prices.csv b/docs/assets/data/monopoly_props_prices.csv deleted file mode 100644 index 0116cc5c1525..000000000000 --- a/docs/assets/data/monopoly_props_prices.csv +++ /dev/null @@ -1,30 +0,0 @@ -property_name,cost -Old Ken Road,60 -Whitechapel Road,60 -Sesame Street,100 -Kings Cross Station,200 -"The Angel, Islington",100 -Euston Road,100 -Pentonville Road,120 -Pall Mall,140 -Electric Company,150 -Whitehall,140 -Northumberland Avenue,160 -Marylebone Station,200 -Bow Street,180 -Marlborough Street,180 -Vine Street,200 -Strand,220 -Fleet Street,220 -Trafalgar Square,240 -Fenchurch St Station,200 -Leicester Square,260 -Coventry Street,260 -Water Works,150 -Piccadilly,280 -Regent Street,300 -Oxford Street,300 -Bond Street,320 -Liverpool Street Station,200 -Park Lane,350 -Mayfair,400 diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py index fe967cf0bea0..fe3f2dbb5091 100644 --- a/docs/source/src/python/user-guide/transformations/joins.py +++ b/docs/source/src/python/user-guide/transformations/joins.py @@ -1,12 +1,16 @@ # --8<-- [start:props_groups] import polars as pl -props_groups = pl.read_csv("docs/assets/data/monopoly_props_groups.csv").head(5) +props_groups = pl.read_csv( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv" +).head(5) print(props_groups) # --8<-- [end:props_groups] # --8<-- [start:props_prices] 
-props_prices = pl.read_csv("docs/assets/data/monopoly_props_prices.csv").head(5) +props_prices = pl.read_csv( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv" +).head(5) print(props_prices) # --8<-- [end:props_prices] diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index 75982facb9fb..e56aa7d82d83 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -1,6 +1,7 @@ # Joins -A join is a dataframe operation in which the rows of two dataframes are concatenated horizontally according to a “joining strategy” and matching criteria. +A join operation combines columns from one or more dataframes into a new dataframe. +The different “joining strategies” and matching criteria used by the different types of joins influence how columns are combined and also what rows are included in the result of the join operation. The most common type of join is an “equi join”, in which rows are matched by a key expression. Polars supports several joining strategies for equi joins, which determine exactly how we handle the matching of rows. @@ -211,7 +212,7 @@ We use the function `join_where` to compute a non-equi join: --8<-- "python/user-guide/transformations/joins.py:non-equi" ``` -You can provide multiple expressions as predicates but they all must use Boolean comparison operators and must refer to columns from both dataframes. +You can provide multiple expressions as predicates but they all must use comparison operators that evaluate to a Boolean result and must refer to columns from both dataframes. !!! note `join_where` is still experimental and doesn't yet support arbitrary Boolean expressions as predicates. 
From ca5e6aa3d1669f4e6580e42a37314e44070282ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:16:15 +0100 Subject: [PATCH 05/11] Translate code examples to Rust. --- .../user-guide/transformations/_joins.py | 162 ------------ .../user-guide/transformations/joins.py | 19 +- docs/source/src/rust/Cargo.toml | 2 +- .../rust/user-guide/transformations/_joins.rs | 243 ------------------ .../rust/user-guide/transformations/joins.rs | 219 ++++++++++++++++ 5 files changed, 231 insertions(+), 414 deletions(-) delete mode 100644 docs/source/src/python/user-guide/transformations/_joins.py delete mode 100644 docs/source/src/rust/user-guide/transformations/_joins.rs diff --git a/docs/source/src/python/user-guide/transformations/_joins.py b/docs/source/src/python/user-guide/transformations/_joins.py deleted file mode 100644 index a34ea310e614..000000000000 --- a/docs/source/src/python/user-guide/transformations/_joins.py +++ /dev/null @@ -1,162 +0,0 @@ -# --8<-- [start:setup] -import polars as pl -from datetime import datetime - -# --8<-- [end:setup] - -# --8<-- [start:innerdf] -df_customers = pl.DataFrame( - { - "customer_id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - } -) -print(df_customers) -# --8<-- [end:innerdf] - -# --8<-- [start:innerdf2] -df_orders = pl.DataFrame( - { - "order_id": ["a", "b", "c"], - "customer_id": [1, 2, 2], - "amount": [100, 200, 300], - } -) -print(df_orders) -# --8<-- [end:innerdf2] - - -# --8<-- [start:inner] -df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner") -print(df_inner_customer_join) -# --8<-- [end:inner] - -# --8<-- [start:left] -df_left_join = df_customers.join(df_orders, on="customer_id", how="left") -print(df_left_join) -# --8<-- [end:left] - -# --8<-- [start:right] -df_right_join = df_orders.join(df_customers, on="customer_id", how="right") -print(df_right_join) -# --8<-- [end:right] - 
-# --8<-- [start:full] -df_outer_join = df_customers.join(df_orders, on="customer_id", how="full") -print(df_outer_join) -# --8<-- [end:full] - -# --8<-- [start:full_coalesce] -df_outer_coalesce_join = df_customers.join( - df_orders, on="customer_id", how="full", coalesce=True -) -print(df_outer_coalesce_join) -# --8<-- [end:full_coalesce] - -# --8<-- [start:df3] -df_colors = pl.DataFrame( - { - "color": ["red", "blue", "green"], - } -) -print(df_colors) -# --8<-- [end:df3] - -# --8<-- [start:df4] -df_sizes = pl.DataFrame( - { - "size": ["S", "M", "L"], - } -) -print(df_sizes) -# --8<-- [end:df4] - -# --8<-- [start:cross] -df_cross_join = df_colors.join(df_sizes, how="cross") -print(df_cross_join) -# --8<-- [end:cross] - -# --8<-- [start:df5] -df_cars = pl.DataFrame( - { - "id": ["a", "b", "c"], - "make": ["ford", "toyota", "bmw"], - } -) -print(df_cars) -# --8<-- [end:df5] - -# --8<-- [start:df6] -df_repairs = pl.DataFrame( - { - "id": ["c", "c"], - "cost": [100, 200], - } -) -print(df_repairs) -# --8<-- [end:df6] - -# --8<-- [start:inner2] -df_inner_join = df_cars.join(df_repairs, on="id", how="inner") -print(df_inner_join) -# --8<-- [end:inner2] - -# --8<-- [start:semi] -df_semi_join = df_cars.join(df_repairs, on="id", how="semi") -print(df_semi_join) -# --8<-- [end:semi] - -# --8<-- [start:anti] -df_anti_join = df_cars.join(df_repairs, on="id", how="anti") -print(df_anti_join) -# --8<-- [end:anti] - -# --8<-- [start:df7] -df_trades = pl.DataFrame( - { - "time": [ - datetime(2020, 1, 1, 9, 1, 0), - datetime(2020, 1, 1, 9, 1, 0), - datetime(2020, 1, 1, 9, 3, 0), - datetime(2020, 1, 1, 9, 6, 0), - ], - "stock": ["A", "B", "B", "C"], - "trade": [101, 299, 301, 500], - } -) -print(df_trades) -# --8<-- [end:df7] - -# --8<-- [start:df8] -df_quotes = pl.DataFrame( - { - "time": [ - datetime(2020, 1, 1, 9, 0, 0), - datetime(2020, 1, 1, 9, 2, 0), - datetime(2020, 1, 1, 9, 4, 0), - datetime(2020, 1, 1, 9, 6, 0), - ], - "stock": ["A", "B", "C", "A"], - "quote": [100, 300, 
501, 102], - } -) - -print(df_quotes) -# --8<-- [end:df8] - -# --8<-- [start:asofpre] -df_trades = df_trades.sort("time") -df_quotes = df_quotes.sort("time") # Set column as sorted -# --8<-- [end:asofpre] - -# --8<-- [start:asof] -df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock") -print(df_asof_join) -# --8<-- [end:asof] - -# --8<-- [start:asof2] -df_asof_tolerance_join = df_trades.join_asof( - df_quotes, on="time", by="stock", tolerance="1m" -) -print(df_asof_tolerance_join) -# --8<-- [end:asof2] diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py index fe3f2dbb5091..e8d13c1dad43 100644 --- a/docs/source/src/python/user-guide/transformations/joins.py +++ b/docs/source/src/python/user-guide/transformations/joins.py @@ -1,16 +1,12 @@ # --8<-- [start:props_groups] import polars as pl -props_groups = pl.read_csv( - "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv" -).head(5) +props_groups = pl.read_csv("docs/assets/data/monopoly_props_groups.csv").head(5) print(props_groups) # --8<-- [end:props_groups] # --8<-- [start:props_prices] -props_prices = pl.read_csv( - "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv" -).head(5) +props_prices = pl.read_csv("docs/assets/data/monopoly_props_prices.csv").head(5) print(props_prices) # --8<-- [end:props_prices] @@ -20,7 +16,9 @@ # --8<-- [end:equi-join] # --8<-- [start:props_groups2] -props_groups2 = props_groups.with_columns(pl.col("property_name").str.to_lowercase()) +props_groups2 = props_groups.with_columns( + pl.col("property_name").str.to_lowercase(), +) print(props_groups2) # --8<-- [end:props_groups2] @@ -74,7 +72,12 @@ # --8<-- [end:full-join] # --8<-- [start:full-join-coalesce] -result = props_groups.join(props_prices, on="property_name", how="full", coalesce=True) +result = props_groups.join( + props_prices, 
+ on="property_name", + how="full", + coalesce=True, +) print(result) # --8<-- [end:full-join-coalesce] diff --git a/docs/source/src/rust/Cargo.toml b/docs/source/src/rust/Cargo.toml index 1bc09b3a8744..8a6607d4aa84 100644 --- a/docs/source/src/rust/Cargo.toml +++ b/docs/source/src/rust/Cargo.toml @@ -124,7 +124,7 @@ required-features = ["polars/lazy"] [[bin]] name = "user-guide-transformations-joins" path = "user-guide/transformations/joins.rs" -required-features = ["polars/lazy", "polars/asof_join"] +required-features = ["polars/lazy", "polars/strings", "polars/semi_anti_join", "polars/iejoin", "polars/cross_join"] [[bin]] name = "user-guide-transformations-unpivot" path = "user-guide/transformations/unpivot.rs" diff --git a/docs/source/src/rust/user-guide/transformations/_joins.rs b/docs/source/src/rust/user-guide/transformations/_joins.rs deleted file mode 100644 index 5caa0cc4ac18..000000000000 --- a/docs/source/src/rust/user-guide/transformations/_joins.rs +++ /dev/null @@ -1,243 +0,0 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] - -fn main() -> Result<(), Box> { - // --8<-- [start:innerdf] - let df_customers = df! 
( - - "customer_id" => &[1, 2, 3], - "name" => &["Alice", "Bob", "Charlie"], - )?; - - println!("{}", &df_customers); - // --8<-- [end:innerdf] - - // --8<-- [start:innerdf2] - let df_orders = df!( - "order_id"=> &["a", "b", "c"], - "customer_id"=> &[1, 2, 2], - "amount"=> &[100, 200, 300], - )?; - println!("{}", &df_orders); - // --8<-- [end:innerdf2] - - // --8<-- [start:inner] - let df_inner_customer_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Inner), - ) - .collect()?; - println!("{}", &df_inner_customer_join); - // --8<-- [end:inner] - - // --8<-- [start:left] - let df_left_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Left), - ) - .collect()?; - println!("{}", &df_left_join); - // --8<-- [end:left] - - // --8<-- [start:right] - let df_right_join = df_orders - .clone() - .lazy() - .join( - df_customers.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Right), - ) - .collect()?; - println!("{}", &df_right_join); - // --8<-- [end:right] - - // --8<-- [start:full] - let df_full_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Full), - ) - .collect()?; - println!("{}", &df_full_join); - // --8<-- [end:full] - - // --8<-- [start:full_coalesce] - let df_full_join = df_customers - .clone() - .lazy() - .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), - ) - .collect()?; - println!("{}", &df_full_join); - // --8<-- [end:full_coalesce] - - // --8<-- [start:df3] - let df_colors = df!( - "color"=> &["red", "blue", "green"], - )?; - println!("{}", &df_colors); - // --8<-- [end:df3] - - // --8<-- [start:df4] - let 
df_sizes = df!( - "size"=> &["S", "M", "L"], - )?; - println!("{}", &df_sizes); - // --8<-- [end:df4] - - // --8<-- [start:cross] - let df_cross_join = df_colors - .clone() - .lazy() - .cross_join(df_sizes.clone().lazy(), None) - .collect()?; - println!("{}", &df_cross_join); - // --8<-- [end:cross] - - // --8<-- [start:df5] - let df_cars = df!( - "id"=> &["a", "b", "c"], - "make"=> &["ford", "toyota", "bmw"], - )?; - println!("{}", &df_cars); - // --8<-- [end:df5] - - // --8<-- [start:df6] - let df_repairs = df!( - "id"=> &["c", "c"], - "cost"=> &[100, 200], - )?; - println!("{}", &df_repairs); - // --8<-- [end:df6] - - // --8<-- [start:inner2] - let df_inner_join = df_cars - .clone() - .lazy() - .inner_join(df_repairs.clone().lazy(), col("id"), col("id")) - .collect()?; - println!("{}", &df_inner_join); - // --8<-- [end:inner2] - - // --8<-- [start:semi] - let df_semi_join = df_cars - .clone() - .lazy() - .join( - df_repairs.clone().lazy(), - [col("id")], - [col("id")], - JoinArgs::new(JoinType::Semi), - ) - .collect()?; - println!("{}", &df_semi_join); - // --8<-- [end:semi] - - // --8<-- [start:anti] - let df_anti_join = df_cars - .clone() - .lazy() - .join( - df_repairs.clone().lazy(), - [col("id")], - [col("id")], - JoinArgs::new(JoinType::Anti), - ) - .collect()?; - println!("{}", &df_anti_join); - // --8<-- [end:anti] - - // --8<-- [start:df7] - use chrono::prelude::*; - let df_trades = df!( - "time"=> &[ - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), - ], - "stock"=> &["A", "B", "B", "C"], - "trade"=> &[101, 299, 301, 500], - )?; - println!("{}", &df_trades); - // --8<-- [end:df7] - - // --8<-- [start:df8] - let df_quotes = df!( - "time"=> &[ - NaiveDate::from_ymd_opt(2020, 1, 
1).unwrap().and_hms_opt(9, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), - ], - "stock"=> &["A", "B", "C", "A"], - "quote"=> &[100, 300, 501, 102], - )?; - - println!("{}", &df_quotes); - // --8<-- [end:df8] - - // --8<-- [start:asofpre] - let df_trades = df_trades - .sort( - ["time"], - SortMultipleOptions::default().with_maintain_order(true), - ) - .unwrap(); - let df_quotes = df_quotes - .sort( - ["time"], - SortMultipleOptions::default().with_maintain_order(true), - ) - .unwrap(); - // --8<-- [end:asofpre] - - // --8<-- [start:asof] - let df_asof_join = df_trades.join_asof_by( - &df_quotes, - "time", - "time", - ["stock"], - ["stock"], - AsofStrategy::Backward, - None, - )?; - println!("{}", &df_asof_join); - // --8<-- [end:asof] - - // --8<-- [start:asof2] - let df_asof_tolerance_join = df_trades.join_asof_by( - &df_quotes, - "time", - "time", - ["stock"], - ["stock"], - AsofStrategy::Backward, - Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), - )?; - println!("{}", &df_asof_tolerance_join); - // --8<-- [end:asof2] - - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/transformations/joins.rs b/docs/source/src/rust/user-guide/transformations/joins.rs index 557bfb2f7bd6..621a8538afae 100644 --- a/docs/source/src/rust/user-guide/transformations/joins.rs +++ b/docs/source/src/rust/user-guide/transformations/joins.rs @@ -4,65 +4,284 @@ use polars::prelude::*; fn main() -> Result<(), Box> { // --8<-- [start:props_groups] + let props_groups = CsvReadOptions::default() + .with_has_header(true) + .try_into_reader_with_file_path(Some( + "../../../assets/data/monopoly_props_groups.csv".into(), + ))? + .finish()? 
+ .head(Some(5)); + println!("{}", props_groups); // --8<-- [end:props_groups] // --8<-- [start:props_prices] + let props_prices = CsvReadOptions::default() + .with_has_header(true) + .try_into_reader_with_file_path(Some( + "../../../assets/data/monopoly_props_prices.csv".into(), + ))? + .finish()? + .head(Some(5)); + println!("{}", props_prices); // --8<-- [end:props_prices] // --8<-- [start:equi-join] + // In Rust, we cannot use the shorthand of specifying a common + // column name just once. + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::default(), + ) + .collect()?; + println!("{}", result); // --8<-- [end:equi-join] // --8<-- [start:props_groups2] + let props_groups2 = props_groups + .clone() + .lazy() + .with_column(col("property_name").str().to_lowercase()) + .collect()?; + println!("{}", props_groups2); // --8<-- [end:props_groups2] // --8<-- [start:props_prices2] + let props_prices2 = props_prices + .clone() + .lazy() + .select([col("property_name").alias("name"), col("cost")]) + .collect()?; + println!("{}", props_prices2); // --8<-- [end:props_prices2] // --8<-- [start:join-key-expression] + let result = props_groups2 + .clone() + .lazy() + .join( + props_prices2.clone().lazy(), + [col("property_name")], + [col("name").str().to_lowercase()], + JoinArgs::default(), + ) + .collect()?; + println!("{}", result); // --8<-- [end:join-key-expression] // --8<-- [start:inner-join] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Inner), + ) + .collect()?; + println!("{}", result); // --8<-- [end:inner-join] // --8<-- [start:left-join] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Left), + ) + .collect()?; + println!("{}", 
result); // --8<-- [end:left-join] // --8<-- [start:right-join] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Right), + ) + .collect()?; + println!("{}", result); // --8<-- [end:right-join] // --8<-- [start:left-right-join-equals] + // `equals_missing` is needed instead of `equals` + // so that missing values compare as equal. + let dfs_match = result.equals_missing( + &props_prices + .clone() + .lazy() + .join( + props_groups.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Left), + ) + .select([ + // Reorder the columns to match the order of `result`. + col("group"), + col("property_name"), + col("cost"), + ]) + .collect()?, + ); + println!("{}", dfs_match); // --8<-- [end:left-right-join-equals] // --8<-- [start:full-join] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Full), + ) + .collect()?; + println!("{}", result); // --8<-- [end:full-join] // --8<-- [start:full-join-coalesce] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), + ) + .collect()?; + println!("{}", result); // --8<-- [end:full-join-coalesce] // --8<-- [start:semi-join] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Semi), + ) + .collect()?; + println!("{}", result); // --8<-- [end:semi-join] // --8<-- [start:anti-join] + let result = props_groups + .clone() + .lazy() + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Anti), + ) + .collect()?; + println!("{}", 
result); // --8<-- [end:anti-join] // --8<-- [start:players] + let players = df!( + "name" => ["Alice", "Bob"], + "cash" => [78, 135], + )?; + println!("{}", players); // --8<-- [end:players] // --8<-- [start:non-equi] + let result = players + .clone() + .lazy() + .join_builder() + .with(props_prices.clone().lazy()) + .join_where(vec![col("cash").cast(DataType::Int64).gt(col("cost"))]) + .collect()?; + println!("{}", result); // --8<-- [end:non-equi] // --8<-- [start:df_trades] + use chrono::prelude::*; + + let df_trades = df!( + "time" => [ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock" => ["A", "B", "B", "C"], + "trade" => [101, 299, 301, 500], + )?; + println!("{}", df_trades); // --8<-- [end:df_trades] // --8<-- [start:df_quotes] + let df_quotes = df!( + "time" => [ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock" => ["A", "B", "C", "A"], + "quote" => [100, 300, 501, 102], + )?; + println!("{}", df_quotes); // --8<-- [end:df_quotes] // --8<-- [start:asof] + let result = df_trades.join_asof_by( + &df_quotes, + "time", + "time", + ["stock"], + ["stock"], + AsofStrategy::Backward, + None, + )?; + println!("{}", result); + // --8<-- [end:asof] // --8<-- [start:asof-tolerance] + let result = df_trades.join_asof_by( + &df_quotes, + "time", + "time", + ["stock"], + ["stock"], + AsofStrategy::Backward, + Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), + )?; + println!("{}", result); // --8<-- [end:asof-tolerance] 
// --8<-- [start:cartesian-product] + let tokens = df!( + "monopoly_token" => ["hat", "shoe", "boat"], + )?; + + let result = players + .clone() + .lazy() + .select([col("name")]) + .cross_join(tokens.clone().lazy(), None) + .collect()?; + println!("{}", result); // --8<-- [end:cartesian-product] Ok(()) From 32aa7a0ead7fc33f25d17188bcd9621502cd6384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:17:37 +0100 Subject: [PATCH 06/11] Add anti, semi, and non-equi joins to Rust docs Without these two flags the Rust API documentation is being built and not including the related join variants. --- crates/polars/Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 31bae99c6146..a710011aa045 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -376,6 +376,8 @@ docs-selection = [ "is_last_distinct", "asof_join", "cross_join", + "semi_anti_join", + "iejoin", "concat_str", "string_reverse", "string_to_integer", From af16607ca42fe2641d2aca3dc888c35d7606d235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:18:31 +0100 Subject: [PATCH 07/11] Update macro to allow language-specific links The macro 'code_block' was updated so that we can specify Python-only or Rust-only API links that may be relevant when the two APIs differ. 
--- docs/source/_build/scripts/macro.py | 20 +++++++++++++++---- docs/source/development/contributing/index.md | 7 +++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/source/_build/scripts/macro.py b/docs/source/_build/scripts/macro.py index 3b8055074d44..651786b0044b 100644 --- a/docs/source/_build/scripts/macro.py +++ b/docs/source/_build/scripts/macro.py @@ -1,10 +1,12 @@ from collections import OrderedDict import os -from typing import List, Optional, Set +from typing import Any, List, Optional, Set import yaml import logging +from mkdocs_macros.plugin import MacrosPlugin + # Supported Languages and their metadata LANGUAGES = OrderedDict( python={ @@ -130,7 +132,7 @@ def code_tab( """ -def define_env(env): +def define_env(env: MacrosPlugin) -> None: @env.macro def code_header( language: str, section: str = [], api_functions: List[str] = [] @@ -154,7 +156,11 @@ def code_header( @env.macro def code_block( - path: str, section: str = None, api_functions: List[str] = None + path: str, + section: str = None, + api_functions: List[str] = None, + python_api_functions: List[str] = None, + rust_api_functions: List[str] = None, ) -> str: """Dynamically generate a code block for the code located under {language}/path @@ -170,8 +176,14 @@ def code_block( for language, info in LANGUAGES.items(): base_path = f"{language}/{path}{info['extension']}" full_path = "docs/source/src/" + base_path + if language == "python": + extras = python_api_functions or [] + else: + extras = rust_api_functions or [] # Check if file exists for the language if os.path.exists(full_path): - result.append(code_tab(base_path, section, info, api_functions)) + result.append( + code_tab(base_path, section, info, api_functions + extras) + ) return "\n".join(result) diff --git a/docs/source/development/contributing/index.md b/docs/source/development/contributing/index.md index 30fb6ddc0ac9..c3175df9f5b2 100644 --- a/docs/source/development/contributing/index.md +++ 
b/docs/source/development/contributing/index.md @@ -268,6 +268,13 @@ df = pl.read_parquet("file.parquet") The snippet is delimited by `--8<-- [start:]` and `--8<-- [end:]`. The snippet name must match the name given in the second argument to `code_block` above. +In some cases, you may need to add links to different functions for the Python and Rust APIs. +When that is the case, you can use the two extra optional arguments that `code_block` accepts, that can be used to pass Python-only and Rust-only links: + +``` +{{code_block('path', 'snippet_name', ['common_api_links'], ['python_only_links'], ['rust_only_links'])}} +``` + #### Linting Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to lint the markdown files. From 633da7b0d45b1921a582ac133ba2b5a7602ecaa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:19:36 +0100 Subject: [PATCH 08/11] Include script that loads data when docs are served We do this once as a macro so that downloading only happens when we start serving the docs instead of downloading the data every time the server reloads, which would happen if we were downloading the data in the Python snippets inside /src/python. --- docs/source/_build/scripts/prep_data.py | 28 +++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 29 insertions(+) create mode 100644 docs/source/_build/scripts/prep_data.py diff --git a/docs/source/_build/scripts/prep_data.py b/docs/source/_build/scripts/prep_data.py new file mode 100644 index 000000000000..71e949671027 --- /dev/null +++ b/docs/source/_build/scripts/prep_data.py @@ -0,0 +1,28 @@ +""" +Downloads data once when serving the docs so that subsequent +subsequent rebuilds do not have to access remote resources again. 
+""" + +import requests + + +DATA = [ + ( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv", + "docs/assets/data/monopoly_props_groups.csv", + ), + ( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv", + "docs/assets/data/monopoly_props_prices.csv", + ), +] + + +for url, dest in DATA: + with open(dest, "wb") as f: + try: + f.write(requests.get(url, timeout=10).content) + except Exception as e: + print(f"WARNING: failed to download file {dest} ({e})") + else: + print(f"INFO: downloaded {dest}") diff --git a/mkdocs.yml b/mkdocs.yml index c180bbfc6b8e..b128adca6502 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -173,6 +173,7 @@ markdown_extensions: hooks: - docs/source/_build/scripts/people.py + - docs/source/_build/scripts/prep_data.py plugins: - search: From a5df31eb083aa45a2dc7e98f71fbbf4dc49232ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:22:36 +0100 Subject: [PATCH 09/11] Fix API links and flags. 
--- docs/source/_build/API_REFERENCE_LINKS.yml | 19 ++++++- .../user-guide/transformations/joins.md | 57 ++++++++++--------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index 41eb9a53bd8f..1e301f592cb1 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -181,6 +181,11 @@ rust: link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic feature_flags: [dynamic_group_by] join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join + join-semi_anti_join_flag: + name: join + link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join + feature_flags: ["semi_anti_join"] + vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html @@ -194,8 +199,18 @@ rust: pivot: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html unpivot: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample - join_asof: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof - join_where: https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where + join_asof_by: + name: join_asof_by + link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by + feature_flags: ['asof_join'] + join_where: + name: join_where + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.JoinBuilder.html#method.join_where + feature_flags: ["iejoin"] + cross_join: + name: cross_join + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.cross_join + feature_flags: 
[cross_join] unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest read_csv: diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index e56aa7d82d83..12209ea23a22 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -13,29 +13,31 @@ The table below acts as a quick reference for people who know what they are look If you want to learn about joins in general and how to work with them in Polars, feel free to skip the table and keep reading below. === ":fontawesome-brands-python: Python" -[:material-api: `join`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html) -[:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) -[:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + + [:material-api: `join`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html) + [:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + [:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) === ":fontawesome-brands-rust: Rust" -[:material-api: `join`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join) -[:material-api: `join_asof`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof) -[:material-flag-plus: Available on feature polars-ops](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag polars-ops"){.feature-flag} + [:material-api: `join`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join) + ([:material-flag-plus:
semi_anti_join](/user-guide/installation/#feature-flags "Enable the feature flag semi_anti_join for semi and for anti joins"){.feature-flag} needed for some options.) + [:material-api: `join_asof_by`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof) + [:material-flag-plus: Available on feature asof_join](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag asof_join"){.feature-flag} [:material-api: `join_where`](https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where) - [:material-flag-plus: Available on feature lazy](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag lazy"){.feature-flag} - -| Type | Function | Brief description | -| --------------------- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Equi inner join | `join(..., how="inner")` | Keeps rows that matched both on the left and right. | -| Equi left outer join | `join(..., how="left")` | Keeps all rows from the left plus matching rows from the right. Non-matching rows from the left have their right columns filled with `null`. | -| Equi right outer join | `join(..., how="right")` | Keeps all rows from the right plus matching rows from the left. Non-matching rows from the right have their left columns filled with `null`. | -| Equi full join | `join(..., how="full")` | Keeps all rows from either dataframe, regardless of whether they match or not. Non-matching rows from one side have the columns from the other side filled with `null`. | -| Equi semi join | `join(..., how="semi")` | Keeps rows from the left that have a match on the right. | -| Equi anti join | `join(..., how="anti")` | Keeps rows from the left that do not have a match on the right. 
| -| Non-equi inner join | `join_where` | Finds all possible pairings of rows from the left and right that satisfy the given predicate(s). | -| Asof join | `join_asof` | Like a left outer join, but matches on the nearest key instead of on exact key matches. | -| Cartesian product | `join(..., how="cross")` | Computes the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of the two dataframes. | + [:material-flag-plus: Available on feature iejoin](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag iejoin"){.feature-flag} + +| Type | Function | Brief description | +| --------------------- | -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Equi inner join | `join(..., how="inner")` | Keeps rows that matched both on the left and right. | +| Equi left outer join | `join(..., how="left")` | Keeps all rows from the left plus matching rows from the right. Non-matching rows from the left have their right columns filled with `null`. | +| Equi right outer join | `join(..., how="right")` | Keeps all rows from the right plus matching rows from the left. Non-matching rows from the right have their left columns filled with `null`. | +| Equi full join | `join(..., how="full")` | Keeps all rows from either dataframe, regardless of whether they match or not. Non-matching rows from one side have the columns from the other side filled with `null`. | +| Equi semi join | `join(..., how="semi")` | Keeps rows from the left that have a match on the right. | +| Equi anti join | `join(..., how="anti")` | Keeps rows from the left that do not have a match on the right. | +| Non-equi inner join | `join_where` | Finds all possible pairings of rows from the left and right that satisfy the given predicate(s). 
| +| Asof join | `join_asof`/`join_asof_by` | Like a left outer join, but matches on the nearest key instead of on exact key matches. | +| Cartesian product | `join(..., how="cross")` | Computes the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of the two dataframes. | ## Equi joins @@ -74,7 +76,7 @@ By default, Polars computes an “inner join” but there are [other join strate In the example above, the two dataframes conveniently had the column we wish to use as key with the same name and with the values in the exact same format. Suppose, for the sake of argument, that one of the dataframes had a differently named column and the other had the property names in lower case: -{{code_block('user-guide/transformations/joins','props_groups2',[])}} +{{code_block('user-guide/transformations/joins','props_groups2',['Expr.str'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:props_groups2" @@ -88,7 +90,7 @@ Suppose, for the sake of argument, that one of the dataframes had a differently In a situation like this, where we may want to perform the same join as before, we can leverage `join`'s flexibility and specify arbitrary expressions to compute the joining key on the left and on the right, allowing one to compute row keys dynamically: -{{code_block('user-guide/transformations/joins','join-key-expression',['join'])}} +{{code_block('user-guide/transformations/joins', 'join-key-expression', ['join', 'Expr.str'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:join-key-expression" @@ -166,13 +168,13 @@ If we wanted to force `join` to coalesce the two columns `property_name` into a --8<-- "python/user-guide/transformations/joins.py:full-join-coalesce" ``` -When not set, the parameter `coalesce` is join-specific, which is why the inner, left, and right, joins act as if `coalesce=True`, even though we didn't set it. 
+When not set, the parameter `coalesce` is determined automatically from the join strategy and the key(s) specified, which is why the inner, left, and right joins acted as if `coalesce=True`, even though we didn't set it. ### Semi join A semi join will return the rows of the left dataframe that have a match in the right dataframe, but we do not actually join the matching rows: -{{code_block('user-guide/transformations/joins','semi-join',['join'])}} +{{code_block('user-guide/transformations/joins', 'semi-join', [], ['join'], ['join-semi_anti_join_flag'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:semi-join" @@ -184,7 +186,7 @@ A semi join acts as a sort of row filter based on a second dataframe. Conversely, an anti join will return the rows of the left dataframe that do not have a match in the right dataframe: -{{code_block('user-guide/transformations/joins','anti-join',['join'])}} +{{code_block('user-guide/transformations/joins', 'anti-join', [], ['join'], ['join-semi_anti_join_flag'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:anti-join" @@ -215,7 +217,8 @@ We use the function `join_where` to compute a non-equi join: You can provide multiple expressions as predicates but they all must use comparison operators that evaluate to a Boolean result and must refer to columns from both dataframes. !!! note -`join_where` is still experimental and doesn't yet support arbitrary Boolean expressions as predicates. + + `join_where` is still experimental and doesn't yet support arbitrary Boolean expressions as predicates. ## Asof join @@ -242,7 +245,7 @@ The broker has another dataframe called `df_quotes` showing prices it has quoted You want to produce a dataframe showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`).
To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. -{{code_block('user-guide/transformations/joins','asof',['join_asof'])}} +{{code_block('user-guide/transformations/joins','asof', [], ['join_asof'], ['join_asof_by'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asof" @@ -262,7 +265,7 @@ In this case we want to make sure that the last preceding quote is within 1 minu Polars allows you to compute the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of two dataframes, producing a dataframe where all rows of the left dataframe are paired up with all the rows of the right dataframe. To compute the Cartesian product of two dataframes, you can pass the strategy `how="cross"` to the function `join` without specifying any of `on`, `left_on`, and `right_on`: -{{code_block('user-guide/transformations/joins','cartesian-product',['join'])}} +{{code_block('user-guide/transformations/joins','cartesian-product',[],['join'],['cross_join'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:cartesian-product" From 2d8c3c72635ac6810de948003c0cf7448a8fafd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Thu, 10 Oct 2024 09:44:09 +0100 Subject: [PATCH 10/11] Download data in the Python script. I was downloading the data in a separate hook but there is a test that just runs all of the Python scripts in src/python and apparently it's not trivial to get the data download to trigger just once, when the appropriate Python script test is running, so it's just easier to move the data download to the script that actually uses it. We also add a comment to the Rust script to direct users to the Python script for the data location. 
--- docs/source/_build/scripts/prep_data.py | 28 ------------------- .../user-guide/transformations/joins.py | 25 +++++++++++++++++ .../rust/user-guide/transformations/joins.rs | 3 ++ mkdocs.yml | 1 - 4 files changed, 28 insertions(+), 29 deletions(-) delete mode 100644 docs/source/_build/scripts/prep_data.py diff --git a/docs/source/_build/scripts/prep_data.py b/docs/source/_build/scripts/prep_data.py deleted file mode 100644 index 71e949671027..000000000000 --- a/docs/source/_build/scripts/prep_data.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -Downloads data once when serving the docs so that subsequent -subsequent rebuilds do not have to access remote resources again. -""" - -import requests - - -DATA = [ - ( - "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv", - "docs/assets/data/monopoly_props_groups.csv", - ), - ( - "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv", - "docs/assets/data/monopoly_props_prices.csv", - ), -] - - -for url, dest in DATA: - with open(dest, "wb") as f: - try: - f.write(requests.get(url, timeout=10).content) - except Exception as e: - print(f"WARNING: failed to download file {dest} ({e})") - else: - print(f"INFO: downloaded {dest}") diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py index e8d13c1dad43..57bb6dd81649 100644 --- a/docs/source/src/python/user-guide/transformations/joins.py +++ b/docs/source/src/python/user-guide/transformations/joins.py @@ -1,3 +1,28 @@ +import requests + + +DATA = [ + ( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv", + "docs/assets/data/monopoly_props_groups.csv", + ), + ( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv", + "docs/assets/data/monopoly_props_prices.csv", + ), +] + + +for url, dest 
in DATA: + with open(dest, "wb") as f: + try: + f.write(requests.get(url, timeout=10).content) + except Exception as e: + print(f"WARNING: failed to download file {dest} ({e})") + else: + print(f"INFO: downloaded {dest}") + + # --8<-- [start:props_groups] import polars as pl diff --git a/docs/source/src/rust/user-guide/transformations/joins.rs b/docs/source/src/rust/user-guide/transformations/joins.rs index 621a8538afae..5d1c50f733b1 100644 --- a/docs/source/src/rust/user-guide/transformations/joins.rs +++ b/docs/source/src/rust/user-guide/transformations/joins.rs @@ -3,6 +3,9 @@ use polars::prelude::*; // --8<-- [end:setup] fn main() -> Result<(), Box> { + // NOTE: This assumes the data has been downloaded and is available. + // See the corresponding Python script for the remote location of the data. + // --8<-- [start:props_groups] let props_groups = CsvReadOptions::default() .with_has_header(true) diff --git a/mkdocs.yml b/mkdocs.yml index b128adca6502..c180bbfc6b8e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -173,7 +173,6 @@ markdown_extensions: hooks: - docs/source/_build/scripts/people.py - - docs/source/_build/scripts/prep_data.py plugins: - search: From 95c6b993b01893b2c1d9f989ef940064e4bb7477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:51:28 +0100 Subject: [PATCH 11/11] Only download data if needed. 
https://github.com/pola-rs/polars/pull/19127#discussion_r1795234002 --- .../src/python/user-guide/transformations/joins.py | 13 ++++++------- docs/source/user-guide/transformations/joins.md | 3 ++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py index 57bb6dd81649..e44cbdc560c1 100644 --- a/docs/source/src/python/user-guide/transformations/joins.py +++ b/docs/source/src/python/user-guide/transformations/joins.py @@ -1,3 +1,5 @@ +# --8<-- [start:prep-data] +import pathlib import requests @@ -14,14 +16,11 @@ for url, dest in DATA: + if pathlib.Path(dest).exists(): + continue with open(dest, "wb") as f: - try: - f.write(requests.get(url, timeout=10).content) - except Exception as e: - print(f"WARNING: failed to download file {dest} ({e})") - else: - print(f"INFO: downloaded {dest}") - + f.write(requests.get(url, timeout=10).content) +# --8<-- [end:prep-data] # --8<-- [start:props_groups] import polars as pl diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index 12209ea23a22..b135a45f53d3 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -50,6 +50,7 @@ First, we load a dataframe that contains property names and their colour group i {{code_block('user-guide/transformations/joins','props_groups',[])}} ```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:prep-data" --8<-- "python/user-guide/transformations/joins.py:props_groups" ``` @@ -254,7 +255,7 @@ To avoid joining between trades on one stock with a quote on another you must sp If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. 
In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`. -{{code_block('user-guide/transformations/joins','asof-tolerance',['join_asof'])}} +{{code_block('user-guide/transformations/joins','asof-tolerance', [], ['join_asof'], ['join_asof_by'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asof-tolerance"