Skip to content

Commit

Permalink
fix to physical
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite committed Dec 23, 2024
1 parent 6218b18 commit e11a0dc
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 104 deletions.
37 changes: 37 additions & 0 deletions crates/polars-core/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
mod iterator;

use std::borrow::Cow;

use crate::prelude::*;

impl ArrayChunked {
Expand Down Expand Up @@ -29,6 +31,41 @@ impl ArrayChunked {
fld.coerce(DataType::Array(Box::new(inner_dtype), width))
}

/// Returns this fixed-size-list column with its inner values in their physical datatype.
///
/// If converting the inner values yields a borrow (no conversion was needed), `self` is
/// returned as `Cow::Borrowed` and no chunks are rebuilt. Otherwise every chunk is rebuilt
/// around the converted values, keeping the chunk's length and validity.
///
/// # Panics
///
/// Panics if the converted inner values do not have the same number of chunks as `self`.
pub fn to_physical_repr(&self) -> Cow<ArrayChunked> {
    let physical_inner = match self.get_inner().to_physical_repr() {
        Cow::Owned(series) => series,
        // The inner values were left untouched, so `self` can be reused as-is.
        Cow::Borrowed(_) => return Cow::Borrowed(self),
    };

    // The chunks are paired up one-to-one below, so both sides must line up.
    assert_eq!(self.chunks().len(), physical_inner.chunks().len());

    let width = self.width();
    let physical_chunks = physical_inner.into_chunks();
    let mut chunks = Vec::with_capacity(physical_chunks.len());
    for (chunk, values) in self.downcast_iter().zip(physical_chunks) {
        // The arrow field has to describe the *converted* values, not the original ones.
        let item_field = ArrowField::new(
            PlSmallStr::from_static("item"),
            values.dtype().clone(),
            true,
        );
        let arrow_dtype = ArrowDataType::FixedSizeList(Box::new(item_field), width);
        let rebuilt = FixedSizeListArray::new(
            arrow_dtype,
            chunk.len(),
            values,
            chunk.validity().cloned(),
        );
        chunks.push(rebuilt.to_boxed());
    }

    let dtype = DataType::Array(Box::new(self.inner_dtype().to_physical()), width);
    let name = self.name().clone();
    // SAFETY: every chunk above was built as a `FixedSizeList` of `width` around the
    // converted inner values.
    // NOTE(review): relies on `inner_dtype().to_physical()` describing those values — confirm.
    let physical = unsafe { ArrayChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) };
    Cow::Owned(physical)
}


/// Convert a non-logical [`ArrayChunked`] back into a logical [`ArrayChunked`] without casting.
///
/// # Safety
Expand Down
33 changes: 33 additions & 0 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
//! Special list utility methods
pub(super) mod iterator;

use std::borrow::Cow;

use crate::prelude::*;

impl ListChunked {
Expand Down Expand Up @@ -36,6 +38,37 @@ impl ListChunked {
fld.coerce(DataType::List(Box::new(inner_dtype)))
}

/// Convert the datatype of the list into the physical datatype.
///
/// Returns `Cow::Borrowed(self)` when converting the inner values yields a borrow (no
/// conversion was needed). Otherwise new chunks are built that reuse each original chunk's
/// offsets and validity around the converted inner values.
///
/// # Panics
///
/// Panics if the converted inner values do not have the same number of chunks as `self`.
pub fn to_physical_repr(&self) -> Cow<ListChunked> {
    let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else {
        return Cow::Borrowed(self);
    };

    // `zip` below stops at the shorter side, so a chunk-count mismatch would silently drop
    // chunks (or pair offsets with the wrong values). Fail loudly instead, mirroring
    // `ArrayChunked::to_physical_repr`.
    assert_eq!(self.chunks().len(), physical_repr.chunks().len());

    let chunks: Vec<_> = self
        .downcast_iter()
        .zip(physical_repr.into_chunks())
        .map(|(chunk, values)| {
            LargeListArray::new(
                // The arrow field has to describe the converted values.
                ArrowDataType::LargeList(Box::new(ArrowField::new(
                    PlSmallStr::from_static("item"),
                    values.dtype().clone(),
                    true,
                ))),
                chunk.offsets().clone(),
                values,
                chunk.validity().cloned(),
            )
            .to_boxed()
        })
        .collect();

    let name = self.name().clone();
    let dtype = DataType::List(Box::new(self.inner_dtype().to_physical()));
    // SAFETY: every chunk above was built as a `LargeList` around the converted inner values.
    // NOTE(review): relies on `inner_dtype().to_physical()` describing those values — confirm.
    Cow::Owned(unsafe {
        ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype)
    })
}

/// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting.
///
/// # Safety
Expand Down
47 changes: 40 additions & 7 deletions crates/polars-core/src/chunked_array/struct_/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod frame;

use std::borrow::Cow;
use std::fmt::Write;

use arrow::array::StructArray;
Expand All @@ -22,14 +23,14 @@ fn constructor<'a, I: ExactSizeIterator<Item = &'a Series> + Clone>(
name: PlSmallStr,
length: usize,
fields: I,
) -> PolarsResult<StructChunked> {
) -> StructChunked {
if fields.len() == 0 {
let dtype = DataType::Struct(Vec::new());
let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
let chunks = vec![StructArray::new(arrow_dtype, length, Vec::new(), None).boxed()];

// SAFETY: We construct each chunk above to have the `Struct` data type.
return Ok(unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) });
return unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) };
}

// Different chunk lengths: rechunk and recurse.
Expand Down Expand Up @@ -61,9 +62,9 @@ fn constructor<'a, I: ExactSizeIterator<Item = &'a Series> + Clone>(
Ok(chunks) => {
// SAFETY: invariants checked above.
unsafe {
Ok(StructChunked::from_chunks_and_dtype_unchecked(
StructChunked::from_chunks_and_dtype_unchecked(
name, chunks, dtype,
))
)
}
},
// Different chunk lengths: rechunk and recurse.
Expand Down Expand Up @@ -117,14 +118,14 @@ impl StructChunked {
}

if !needs_to_broadcast {
return constructor(name, length, fields);
return Ok(constructor(name, length, fields));
}

if length == 0 {
// @NOTE: There are columns that are being broadcasted so we need to clear those.
let new_fields = fields.map(|s| s.clear()).collect::<Vec<_>>();

return constructor(name, length, new_fields.iter());
return Ok(constructor(name, length, new_fields.iter()));
}

let new_fields = fields
Expand All @@ -136,7 +137,39 @@ impl StructChunked {
}
})
.collect::<Vec<_>>();
constructor(name, length, new_fields.iter())
Ok(constructor(name, length, new_fields.iter()))
}

/// Returns this struct column with every field in its physical datatype.
///
/// If no field's `to_physical_repr` produces an owned series, `self` is returned as
/// `Cow::Borrowed` and nothing is rebuilt. Otherwise a new struct is assembled from the
/// converted fields, and `self`'s validity is zipped back in when `self` reports nulls.
pub fn to_physical_repr(&self) -> Cow<StructChunked> {
    let field_series = self.fields_as_series();

    // Find the first field whose conversion yields an owned series; every field before it
    // came back as a borrow and can simply be cloned.
    let first_converted = field_series
        .iter()
        .enumerate()
        .find_map(|(idx, s)| match s.to_physical_repr() {
            Cow::Owned(physical) => Some((idx, physical)),
            Cow::Borrowed(_) => None,
        });

    let Some((first_idx, first_physical)) = first_converted else {
        // No field yielded an owned conversion.
        return Cow::Borrowed(self);
    };

    let mut physicals = Vec::with_capacity(field_series.len());
    physicals.extend(field_series[..first_idx].iter().cloned());
    physicals.push(first_physical);
    // The remaining fields have not been inspected yet; convert each of them.
    let remaining = &field_series[first_idx + 1..];
    physicals.extend(remaining.iter().map(|s| s.to_physical_repr().into_owned()));

    let mut ca = constructor(self.name().clone(), self.length, physicals.iter());
    if self.null_count() > 0 {
        // Re-apply `self`'s validity to the freshly constructed struct.
        ca.zip_outer_validity(self);
    }

    Cow::Owned(ca)
}

/// Convert a non-logical [`StructChunked`] back into a logical [`StructChunked`] without casting.
Expand Down
107 changes: 11 additions & 96 deletions crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ pub use from::*;
pub use iterator::{SeriesIter, SeriesPhysIter};
use num_traits::NumCast;
use polars_error::feature_gated;
use polars_utils::itertools::Itertools;
pub use series_trait::{IsSorted, *};

use crate::chunked_array::cast::CastOptions;
Expand Down Expand Up @@ -687,105 +686,21 @@ impl Series {
},
#[cfg(feature = "dtype-decimal")]
Decimal(_, _) => Cow::Owned(self.decimal().unwrap().0.clone().into_series()),
List(inner) => Cow::Owned(self.cast(&List(Box::new(inner.to_physical()))).unwrap()),
#[cfg(feature = "dtype-array")]
Array(inner, size) => Cow::Owned(
self.cast(&Array(Box::new(inner.to_physical()), *size))
.unwrap(),
),
#[cfg(feature = "dtype-struct")]
Struct(_) => {
let arr = self.struct_().unwrap();
let fields: Vec<_> = arr
.fields_as_series()
.iter()
.map(|s| s.to_physical_repr().into_owned())
.collect();
let mut ca =
StructChunked::from_series(self.name().clone(), arr.len(), fields.iter())
.unwrap();

if arr.null_count() > 0 {
ca.zip_outer_validity(arr);
}
Cow::Owned(ca.into_series())
List(_) => match self.list().unwrap().to_physical_repr() {
Cow::Borrowed(_) => Cow::Borrowed(self),
Cow::Owned(ca) => Cow::Owned(ca.into_series()),
},
_ => Cow::Borrowed(self),
}
}

/// Attempts to convert a Series to dtype, only allowing conversions from
/// physical to logical dtypes--the inverse of to_physical_repr().
///
/// # Safety
/// When converting from UInt32 to Categorical it is not checked that the
/// values are in-bound for the categorical mapping.
pub unsafe fn to_logical_repr_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {
use DataType::*;

let err = || {
Err(
polars_err!(ComputeError: "can't cast from {} to {} in to_logical_repr_unchecked", self.dtype(), dtype),
)
};

match dtype {
dt if self.dtype() == dt => Ok(self.clone()),
#[cfg(feature = "dtype-date")]
Date => Ok(self.i32()?.clone().into_date().into_series()),
#[cfg(feature = "dtype-datetime")]
Datetime(u, z) => Ok(self
.i64()?
.clone()
.into_datetime(*u, z.clone())
.into_series()),
#[cfg(feature = "dtype-duration")]
Duration(u) => Ok(self.i64()?.clone().into_duration(*u).into_series()),
#[cfg(feature = "dtype-time")]
Time => Ok(self.i64()?.clone().into_time().into_series()),
#[cfg(feature = "dtype-decimal")]
Decimal(precision, scale) => Ok(self
.i128()?
.clone()
.into_decimal(*precision, scale.unwrap())?
.into_series()),
#[cfg(feature = "dtype-categorical")]
Categorical { .. } | Enum { .. } => {
Ok(CategoricalChunked::from_cats_and_dtype_unchecked(
self.u32()?.clone(),
dtype.clone(),
)
.into_series())
},
List(inner) => {
if let List(self_inner) = self.dtype() {
if inner.to_physical() == **self_inner {
return self.cast(dtype);
}
}
err()
#[cfg(feature = "dtype-array")]
Array(_, _) => match self.array().unwrap().to_physical_repr() {
Cow::Borrowed(_) => Cow::Borrowed(self),
Cow::Owned(ca) => Cow::Owned(ca.into_series()),
},
#[cfg(feature = "dtype-struct")]
Struct(target_fields) => {
let ca = self.struct_().unwrap();
if ca.struct_fields().len() != target_fields.len() {
return err();
}
let fields = ca
.fields_as_series()
.iter()
.zip(target_fields)
.map(|(s, tf)| s.to_logical_repr_unchecked(tf.dtype()))
.try_collect_vec()?;
let mut result =
StructChunked::from_series(self.name().clone(), ca.len(), fields.iter())?;
if ca.null_count() > 0 {
result.zip_outer_validity(ca);
}
Ok(result.into_series())
Struct(_) => match self.struct_().unwrap().to_physical_repr() {
Cow::Borrowed(_) => Cow::Borrowed(self),
Cow::Owned(ca) => Cow::Owned(ca.into_series()),
},

_ => err(),
_ => Cow::Borrowed(self),
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-expr/src/groups/row_encoded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ impl RowEncodedHashGrouper {
.zip(key_columns)
.map(|((name, dt), col)| {
let s = Series::try_from((name.clone(), col)).unwrap();
unsafe { s.to_logical_repr_unchecked(dt) }
unsafe { s.from_physical_unchecked(dt) }
.unwrap()
.into_column()
})
Expand Down

0 comments on commit e11a0dc

Please sign in to comment.