From e11a0dc3cd25c4d6272728e780419bb3e0f7d93e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Mon, 23 Dec 2024 10:22:40 +0100 Subject: [PATCH] fix to physical --- .../src/chunked_array/array/mod.rs | 37 ++++++ .../polars-core/src/chunked_array/list/mod.rs | 33 ++++++ .../src/chunked_array/struct_/mod.rs | 47 ++++++-- crates/polars-core/src/series/mod.rs | 107 ++---------------- crates/polars-expr/src/groups/row_encoded.rs | 2 +- 5 files changed, 122 insertions(+), 104 deletions(-) diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs index 291e51ec1532..b1992119e4b5 100644 --- a/crates/polars-core/src/chunked_array/array/mod.rs +++ b/crates/polars-core/src/chunked_array/array/mod.rs @@ -2,6 +2,8 @@ mod iterator; +use std::borrow::Cow; + use crate::prelude::*; impl ArrayChunked { @@ -29,6 +31,41 @@ impl ArrayChunked { fld.coerce(DataType::Array(Box::new(inner_dtype), width)) } + /// Convert the datatype of the array into the physical datatype. + pub fn to_physical_repr(&self) -> Cow { + let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else { + return Cow::Borrowed(self); + }; + + assert_eq!(self.chunks().len(), physical_repr.chunks().len()); + + let width = self.width(); + let chunks: Vec<_> = self + .downcast_iter() + .zip(physical_repr.into_chunks()) + .map(|(chunk, values)| { + FixedSizeListArray::new( + ArrowDataType::FixedSizeList(Box::new(ArrowField::new( + PlSmallStr::from_static("item"), + values.dtype().clone(), + true, + )), width), + chunk.len(), + values, + chunk.validity().cloned(), + ) + .to_boxed() + }) + .collect(); + + let name = self.name().clone(); + let dtype = DataType::Array(Box::new(self.inner_dtype().to_physical()), width); + Cow::Owned(unsafe { + ArrayChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) + }) + } + + /// Convert a non-logical [`ArrayChunked`] back into a logical [`ArrayChunked`] without casting. /// /// # Safety diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs index 8e3a67b348b7..3bc08f99a70d 100644 --- a/crates/polars-core/src/chunked_array/list/mod.rs +++ b/crates/polars-core/src/chunked_array/list/mod.rs @@ -1,6 +1,8 @@ //! Special list utility methods pub(super) mod iterator; +use std::borrow::Cow; + use crate::prelude::*; impl ListChunked { @@ -36,6 +38,37 @@ impl ListChunked { fld.coerce(DataType::List(Box::new(inner_dtype))) } + /// Convert the datatype of the list into the physical datatype. + pub fn to_physical_repr(&self) -> Cow { + let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else { + return Cow::Borrowed(self); + }; + + let chunks: Vec<_> = self + .downcast_iter() + .zip(physical_repr.into_chunks()) + .map(|(chunk, values)| { + LargeListArray::new( + ArrowDataType::LargeList(Box::new(ArrowField::new( + PlSmallStr::from_static("item"), + values.dtype().clone(), + true, + ))), + chunk.offsets().clone(), + values, + chunk.validity().cloned(), + ) + .to_boxed() + }) + .collect(); + + let name = self.name().clone(); + let dtype = DataType::List(Box::new(self.inner_dtype().to_physical())); + Cow::Owned(unsafe { + ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) + }) + } + /// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting. /// /// # Safety diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 2dc07984d8af..ee5ad7dedc8c 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -1,5 +1,6 @@ mod frame; +use std::borrow::Cow; use std::fmt::Write; use arrow::array::StructArray; @@ -22,14 +23,14 @@ fn constructor<'a, I: ExactSizeIterator + Clone>( name: PlSmallStr, length: usize, fields: I, -) -> PolarsResult { +) -> StructChunked { if fields.len() == 0 { let dtype = DataType::Struct(Vec::new()); let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest()); let chunks = vec![StructArray::new(arrow_dtype, length, Vec::new(), None).boxed()]; // SAFETY: We construct each chunk above to have the `Struct` data type. - return Ok(unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) }); + return unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) }; } // Different chunk lengths: rechunk and recurse. @@ -61,9 +62,9 @@ fn constructor<'a, I: ExactSizeIterator + Clone>( Ok(chunks) => { // SAFETY: invariants checked above. unsafe { - Ok(StructChunked::from_chunks_and_dtype_unchecked( + StructChunked::from_chunks_and_dtype_unchecked( name, chunks, dtype, - )) + ) } }, // Different chunk lengths: rechunk and recurse. @@ -117,14 +118,14 @@ impl StructChunked { } if !needs_to_broadcast { - return constructor(name, length, fields); + return Ok(constructor(name, length, fields)); } if length == 0 { // @NOTE: There are columns that are being broadcasted so we need to clear those. let new_fields = fields.map(|s| s.clear()).collect::>(); - return constructor(name, length, new_fields.iter()); + return Ok(constructor(name, length, new_fields.iter())); } let new_fields = fields @@ -136,7 +137,39 @@ impl StructChunked { } }) .collect::>(); - constructor(name, length, new_fields.iter()) + Ok(constructor(name, length, new_fields.iter())) + } + + /// Convert a struct to the underlying physical datatype. + pub fn to_physical_repr(&self) -> Cow { + let mut physicals = Vec::new(); + + let field_series = self.fields_as_series(); + for (i, s) in field_series.iter().enumerate() { + if let Cow::Owned(physical) = s.to_physical_repr() { + physicals.reserve(field_series.len()); + physicals.extend(field_series[..i].iter().cloned()); + physicals.push(physical); + break; + } + } + + if physicals.is_empty() { + return Cow::Borrowed(self); + } + + physicals.extend( + field_series[physicals.len()..] + .iter() + .map(|s| s.to_physical_repr().into_owned()), + ); + + let mut ca = constructor(self.name().clone(), self.length, physicals.iter()); + if self.null_count() > 0 { + ca.zip_outer_validity(self); + } + + Cow::Owned(ca) } /// Convert a non-logical [`StructChunked`] back into a logical [`StructChunked`] without casting. diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 5a3aacf81453..01dbcf33db33 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -35,7 +35,6 @@ pub use from::*; pub use iterator::{SeriesIter, SeriesPhysIter}; use num_traits::NumCast; use polars_error::feature_gated; -use polars_utils::itertools::Itertools; pub use series_trait::{IsSorted, *}; use crate::chunked_array::cast::CastOptions; @@ -687,105 +686,21 @@ impl Series { }, #[cfg(feature = "dtype-decimal")] Decimal(_, _) => Cow::Owned(self.decimal().unwrap().0.clone().into_series()), - List(inner) => Cow::Owned(self.cast(&List(Box::new(inner.to_physical()))).unwrap()), - #[cfg(feature = "dtype-array")] - Array(inner, size) => Cow::Owned( - self.cast(&Array(Box::new(inner.to_physical()), *size)) - .unwrap(), - ), - #[cfg(feature = "dtype-struct")] - Struct(_) => { - let arr = self.struct_().unwrap(); - let fields: Vec<_> = arr - .fields_as_series() - .iter() - .map(|s| s.to_physical_repr().into_owned()) - .collect(); - let mut ca = - StructChunked::from_series(self.name().clone(), arr.len(), fields.iter()) - .unwrap(); - - if arr.null_count() > 0 { - ca.zip_outer_validity(arr); - } - Cow::Owned(ca.into_series()) + List(_) => match self.list().unwrap().to_physical_repr() { + Cow::Borrowed(_) => Cow::Borrowed(self), + Cow::Owned(ca) => Cow::Owned(ca.into_series()), }, - _ => Cow::Borrowed(self), - } - } - - /// Attempts to convert a Series to dtype, only allowing conversions from - /// physical to logical dtypes--the inverse of to_physical_repr(). - /// - /// # Safety - /// When converting from UInt32 to Categorical it is not checked that the - /// values are in-bound for the categorical mapping. - pub unsafe fn to_logical_repr_unchecked(&self, dtype: &DataType) -> PolarsResult { - use DataType::*; - - let err = || { - Err( - polars_err!(ComputeError: "can't cast from {} to {} in to_logical_repr_unchecked", self.dtype(), dtype), - ) - }; - - match dtype { - dt if self.dtype() == dt => Ok(self.clone()), - #[cfg(feature = "dtype-date")] - Date => Ok(self.i32()?.clone().into_date().into_series()), - #[cfg(feature = "dtype-datetime")] - Datetime(u, z) => Ok(self - .i64()? - .clone() - .into_datetime(*u, z.clone()) - .into_series()), - #[cfg(feature = "dtype-duration")] - Duration(u) => Ok(self.i64()?.clone().into_duration(*u).into_series()), - #[cfg(feature = "dtype-time")] - Time => Ok(self.i64()?.clone().into_time().into_series()), - #[cfg(feature = "dtype-decimal")] - Decimal(precision, scale) => Ok(self - .i128()? - .clone() - .into_decimal(*precision, scale.unwrap())? - .into_series()), - #[cfg(feature = "dtype-categorical")] - Categorical { .. } | Enum { .. } => { - Ok(CategoricalChunked::from_cats_and_dtype_unchecked( - self.u32()?.clone(), - dtype.clone(), - ) - .into_series()) - }, - List(inner) => { - if let List(self_inner) = self.dtype() { - if inner.to_physical() == **self_inner { - return self.cast(dtype); - } - } - err() + #[cfg(feature = "dtype-array")] + Array(_, _) => match self.array().unwrap().to_physical_repr() { + Cow::Borrowed(_) => Cow::Borrowed(self), + Cow::Owned(ca) => Cow::Owned(ca.into_series()), }, #[cfg(feature = "dtype-struct")] - Struct(target_fields) => { - let ca = self.struct_().unwrap(); - if ca.struct_fields().len() != target_fields.len() { - return err(); - } - let fields = ca - .fields_as_series() - .iter() - .zip(target_fields) - .map(|(s, tf)| s.to_logical_repr_unchecked(tf.dtype())) - .try_collect_vec()?; - let mut result = - StructChunked::from_series(self.name().clone(), ca.len(), fields.iter())?; - if ca.null_count() > 0 { - result.zip_outer_validity(ca); - } - Ok(result.into_series()) + Struct(_) => match self.struct_().unwrap().to_physical_repr() { + Cow::Borrowed(_) => Cow::Borrowed(self), + Cow::Owned(ca) => Cow::Owned(ca.into_series()), }, - - _ => err(), + _ => Cow::Borrowed(self), } } diff --git a/crates/polars-expr/src/groups/row_encoded.rs b/crates/polars-expr/src/groups/row_encoded.rs index e51f82f85004..885f8c6114e7 100644 --- a/crates/polars-expr/src/groups/row_encoded.rs +++ b/crates/polars-expr/src/groups/row_encoded.rs @@ -54,7 +54,7 @@ impl RowEncodedHashGrouper { .zip(key_columns) .map(|((name, dt), col)| { let s = Series::try_from((name.clone(), col)).unwrap(); - unsafe { s.to_logical_repr_unchecked(dt) } + unsafe { s.from_physical_unchecked(dt) } .unwrap() .into_column() })