From 919e5bdd19b939e1bac737287474de98e9836714 Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sat, 13 Jul 2024 04:17:56 +0000 Subject: [PATCH 1/2] fix(rust!, python): reject literal input in `sort_by_exprs()` --- crates/polars-lazy/src/frame/mod.rs | 35 +++++++++----- crates/polars-lazy/src/tests/queries.rs | 2 +- crates/polars-lazy/src/tests/streaming.rs | 8 ++-- crates/polars-lazy/src/tests/tpch.rs | 2 +- crates/polars-sql/src/context.rs | 4 +- crates/polars-sql/tests/ops_distinct_on.rs | 8 ++-- crates/polars-sql/tests/simple_exprs.rs | 2 +- crates/polars/src/docs/lazy.rs | 2 +- py-polars/src/lazyframe/mod.rs | 56 ++++++++++++---------- py-polars/tests/unit/dataframe/test_df.py | 4 ++ 10 files changed, 72 insertions(+), 51 deletions(-) diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 9986604ac878..8668ace72014 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -341,21 +341,28 @@ impl LazyFrame { /// /// Sort DataFrame by 'sepal_width' column /// fn example(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .sort_by_exprs(vec![col("sepal_width")], Default::default()) + /// .sort_by_exprs(vec![col("sepal_width")], Default::default())? /// } /// ``` pub fn sort_by_exprs>( self, by_exprs: E, sort_options: SortMultipleOptions, - ) -> Self { + ) -> PolarsResult { let by_exprs = by_exprs.as_ref().to_vec(); if by_exprs.is_empty() { - self + Ok(self) } else { + let is_include_literal = by_exprs.iter().any(|expr| matches!(expr, Expr::Literal(_))); + + polars_ensure!( + !is_include_literal, + ComputeError: "literal expressions are not allowed for sorting" + ); + let opt_state = self.get_opt_state(); let lp = self.get_plan_builder().sort(by_exprs, sort_options).build(); - Self::from_logical_plan(lp, opt_state) + Ok(Self::from_logical_plan(lp, opt_state)) } } @@ -364,13 +371,14 @@ impl LazyFrame { k: IdxSize, by_exprs: E, sort_options: SortMultipleOptions, - ) -> Self { + ) -> PolarsResult { // this will optimize to top-k - self.sort_by_exprs( - by_exprs, - sort_options.with_order_reversed().with_nulls_last(true), - ) - .slice(0, k) + Ok(self + .sort_by_exprs( + by_exprs, + sort_options.with_order_reversed().with_nulls_last(true), + )? + .slice(0, k)) } pub fn bottom_k>( @@ -378,10 +386,11 @@ impl LazyFrame { k: IdxSize, by_exprs: E, sort_options: SortMultipleOptions, - ) -> Self { + ) -> PolarsResult { // this will optimize to bottom-k - self.sort_by_exprs(by_exprs, sort_options.with_nulls_last(true)) - .slice(0, k) + Ok(self + .sort_by_exprs(by_exprs, sort_options.with_nulls_last(true))? + .slice(0, k)) } /// Reverse the `DataFrame` from top to bottom. diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index ade6df69c57e..11bf8a7b8bdd 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -1911,7 +1911,7 @@ fn test_sort_maintain_order_true() -> PolarsResult<()> { SortMultipleOptions::default() .with_maintain_order(true) .with_nulls_last(true), - ) + )? .slice(0, 3) .collect()?; println!("{:?}", res); diff --git a/crates/polars-lazy/src/tests/streaming.rs b/crates/polars-lazy/src/tests/streaming.rs index d8d76384ed0c..531141b2c886 100644 --- a/crates/polars-lazy/src/tests/streaming.rs +++ b/crates/polars-lazy/src/tests/streaming.rs @@ -105,7 +105,7 @@ fn test_streaming_multiple_keys_aggregate() -> PolarsResult<()> { .sort_by_exprs( [col("sugars_g"), col("calories")], SortMultipleOptions::default().with_order_descending_multi([false, false]), - ); + )?; assert_streaming_with_default(q, true, false); Ok(()) @@ -138,7 +138,7 @@ fn test_streaming_unique() -> PolarsResult<()> { .sort_by_exprs( [cols(["sugars_g", "calories"])], SortMultipleOptions::default(), - ); + )?; assert_streaming_with_default(q, true, false); Ok(()) @@ -392,7 +392,7 @@ fn test_sort_maintain_order_streaming() -> PolarsResult<()> { SortMultipleOptions::default() .with_nulls_last(true) .with_maintain_order(true), - ) + )? .slice(0, 3) .with_streaming(true) .collect()?; @@ -419,7 +419,7 @@ fn test_streaming_full_outer_join() -> PolarsResult<()> { let q = lf_left .full_join(lf_right, col("a"), col("a")) - .sort_by_exprs([all()], SortMultipleOptions::default()); + .sort_by_exprs([all()], SortMultipleOptions::default())?; // Toggle so that the join order is swapped. for toggle in [true, true] { diff --git a/crates/polars-lazy/src/tests/tpch.rs b/crates/polars-lazy/src/tests/tpch.rs index 49eed184f72a..8c2abb8f54f5 100644 --- a/crates/polars-lazy/src/tests/tpch.rs +++ b/crates/polars-lazy/src/tests/tpch.rs @@ -81,7 +81,7 @@ fn test_q2() -> PolarsResult<()> { SortMultipleOptions::default() .with_order_descending_multi([true, false, false, false]) .with_maintain_order(true), - ) + )? .limit(100) .with_comm_subplan_elim(true); diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 3b2c3c43c919..54b072e60caa 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -1098,13 +1098,13 @@ impl SQLContext { )?) } } - Ok(lf.sort_by_exprs( + lf.sort_by_exprs( &by, SortMultipleOptions::default() .with_order_descending_multi(descending) .with_nulls_last_multi(nulls_last) .with_maintain_order(true), - )) + ) } fn process_group_by( diff --git a/crates/polars-sql/tests/ops_distinct_on.rs b/crates/polars-sql/tests/ops_distinct_on.rs index 6d9c81a7be41..1b110ce6d60b 100644 --- a/crates/polars-sql/tests/ops_distinct_on.rs +++ b/crates/polars-sql/tests/ops_distinct_on.rs @@ -1,10 +1,11 @@ use polars_core::chunked_array::ops::SortMultipleOptions; use polars_core::df; +use polars_error::PolarsResult; use polars_lazy::prelude::*; use polars_sql::*; #[test] -fn test_distinct_on() { +fn test_distinct_on() -> PolarsResult<()> { let df = df! { "Name" => ["Bob", "Pete", "Pete", "Pete", "Martha", "Martha"], "Record Date" => [1, 1, 2, 4, 1, 3], @@ -33,9 +34,10 @@ fn test_distinct_on() { SortMultipleOptions::default() .with_order_descending_multi([false, true]) .with_maintain_order(true), - ) + )? .group_by_stable(vec![col("Name")]) .agg(vec![col("*").first()]); let expected = expected.collect().unwrap(); - assert!(actual.equals(&expected)) + assert!(actual.equals(&expected)); + Ok(()) } diff --git a/crates/polars-sql/tests/simple_exprs.rs b/crates/polars-sql/tests/simple_exprs.rs index c37e12ed3040..b63940b05034 100644 --- a/crates/polars-sql/tests/simple_exprs.rs +++ b/crates/polars-sql/tests/simple_exprs.rs @@ -603,7 +603,7 @@ fn test_group_by_2() -> PolarsResult<()> { .sort_by_exprs( vec![col("count"), col("category")], SortMultipleOptions::default().with_order_descending_multi([false, true]), - ) + )? .limit(2); let expected = expected.collect()?; diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index c91367490130..86f8d155156c 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -85,7 +85,7 @@ //! let descending = vec![true, false]; //! //! let sorted = df.lazy() -//! .sort_by_exprs(vec![col("b"), col("a")], descending, false, false) +//! .sort_by_exprs(vec![col("b"), col("a")], descending, false, false)? //! .collect()?; //! //! // sorted: diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index 635f89dfd38a..1d6d43a006b4 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -511,41 +511,47 @@ impl PyLazyFrame { nulls_last: Vec, maintain_order: bool, multithreaded: bool, - ) -> Self { + ) -> PyResult { let ldf = self.ldf.clone(); let exprs = by.to_exprs(); - ldf.sort_by_exprs( - exprs, - SortMultipleOptions { - descending, - nulls_last, - maintain_order, - multithreaded, - }, - ) - .into() + let out = ldf + .sort_by_exprs( + exprs, + SortMultipleOptions { + descending, + nulls_last, + maintain_order, + multithreaded, + }, + ) + .map_err(PyPolarsErr::from)?; + Ok(out.into()) } - fn top_k(&self, k: IdxSize, by: Vec, reverse: Vec) -> Self { + fn top_k(&self, k: IdxSize, by: Vec, reverse: Vec) -> PyResult { let ldf = self.ldf.clone(); let exprs = by.to_exprs(); - ldf.top_k( - k, - exprs, - SortMultipleOptions::new().with_order_descending_multi(reverse), - ) - .into() + let out = ldf + .top_k( + k, + exprs, + SortMultipleOptions::new().with_order_descending_multi(reverse), + ) + .map_err(PyPolarsErr::from)?; + Ok(out.into()) } - fn bottom_k(&self, k: IdxSize, by: Vec, reverse: Vec) -> Self { + fn bottom_k(&self, k: IdxSize, by: Vec, reverse: Vec) -> PyResult { let ldf = self.ldf.clone(); let exprs = by.to_exprs(); - ldf.bottom_k( - k, - exprs, - SortMultipleOptions::new().with_order_descending_multi(reverse), - ) - .into() + let out = ldf + .bottom_k( + k, + exprs, + SortMultipleOptions::new().with_order_descending_multi(reverse), + ) + .map_err(PyPolarsErr::from)?; + Ok(out.into()) } fn cache(&self) -> Self { diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index e6e4c64a11c1..f381b7b75ca6 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -271,6 +271,10 @@ def test_sort() -> None: assert_frame_equal( df.sort(["a", "b"]), pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]}) ) + with pytest.raises(ComputeError): + df.sort([1, 2]) + with pytest.raises(ComputeError): + df.sort(1) def test_sort_maintain_order() -> None: From a3423d1a542f727a0529944046250b07b292c8d5 Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sat, 13 Jul 2024 04:28:28 +0000 Subject: [PATCH 2/2] docs(rust): fix example --- crates/polars-lazy/src/frame/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 8668ace72014..a9aa7e5ea80f 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -341,7 +341,8 @@ impl LazyFrame { /// /// Sort DataFrame by 'sepal_width' column /// fn example(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .sort_by_exprs(vec![col("sepal_width")], Default::default())? + /// .sort_by_exprs(vec![col("sepal_width")], Default::default()) + /// .unwrap() /// } /// ``` pub fn sort_by_exprs>(