Skip to content

Commit

Permalink
rename dictionary to context
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite committed Dec 23, 2024
1 parent 62ebbe5 commit 6218b18
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 43 deletions.
46 changes: 25 additions & 21 deletions crates/polars-core/src/chunked_array/ops/row_encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls(
))
}

pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContext> {
/// Get the [`RowEncodingContext`] for a certain [`DataType`].
///
/// This should be given the logical type in order to communicate Polars datatype information down
/// into the row encoding / decoding.
pub fn get_row_encoding_context(dtype: &DataType) -> Option<RowEncodingContext> {
match dtype {
DataType::Boolean
| DataType::UInt8
Expand Down Expand Up @@ -104,8 +108,8 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContex
},

#[cfg(feature = "dtype-array")]
DataType::Array(dtype, _) => get_row_encoding_dictionary(dtype),
DataType::List(dtype) => get_row_encoding_dictionary(dtype),
DataType::Array(dtype, _) => get_row_encoding_context(dtype),
DataType::List(dtype) => get_row_encoding_context(dtype),
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(revmap, ordering) | DataType::Enum(revmap, ordering) => {
let revmap = revmap.as_ref().unwrap();
Expand Down Expand Up @@ -161,28 +165,28 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContex
},
#[cfg(feature = "dtype-struct")]
DataType::Struct(fs) => {
let mut out = Vec::new();
let mut ctxts = Vec::new();

for (i, f) in fs.iter().enumerate() {
if let Some(dict) = get_row_encoding_dictionary(f.dtype()) {
out.reserve(fs.len());
out.extend(std::iter::repeat_n(None, i));
out.push(Some(dict));
if let Some(ctxt) = get_row_encoding_context(f.dtype()) {
ctxts.reserve(fs.len());
ctxts.extend(std::iter::repeat_n(None, i));
ctxts.push(Some(ctxt));
break;
}
}

if out.is_empty() {
if ctxts.is_empty() {
return None;
}

out.extend(
fs[out.len()..]
ctxts.extend(
fs[ctxts.len()..]
.iter()
.map(|f| get_row_encoding_dictionary(f.dtype())),
.map(|f| get_row_encoding_context(f.dtype())),
);

Some(RowEncodingContext::Struct(out))
Some(RowEncodingContext::Struct(ctxts))
},
}
}
Expand All @@ -198,7 +202,7 @@ pub fn encode_rows_unordered(by: &[Column]) -> PolarsResult<BinaryOffsetChunked>
pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
let mut cols = Vec::with_capacity(by.len());
let mut opts = Vec::with_capacity(by.len());
let mut dicts = Vec::with_capacity(by.len());
let mut ctxts = Vec::with_capacity(by.len());

// Since ZFS exists, we might not actually have any arrays and need to get the length from the
// columns.
Expand All @@ -210,13 +214,13 @@ pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
let by = by.as_materialized_series();
let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed();
let opt = RowEncodingOptions::new_unsorted();
let dict = get_row_encoding_dictionary(by.dtype());
let ctxt = get_row_encoding_context(by.dtype());

cols.push(arr);
opts.push(opt);
dicts.push(dict);
ctxts.push(ctxt);
}
Ok(convert_columns(num_rows, &cols, &opts, &dicts))
Ok(convert_columns(num_rows, &cols, &opts, &ctxts))
}

pub fn _get_rows_encoded(
Expand All @@ -229,7 +233,7 @@ pub fn _get_rows_encoded(

let mut cols = Vec::with_capacity(by.len());
let mut opts = Vec::with_capacity(by.len());
let mut dicts = Vec::with_capacity(by.len());
let mut ctxts = Vec::with_capacity(by.len());

// Since ZFS exists, we might not actually have any arrays and need to get the length from the
// columns.
Expand All @@ -241,13 +245,13 @@ pub fn _get_rows_encoded(
let by = by.as_materialized_series();
let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed();
let opt = RowEncodingOptions::new_sorted(*desc, *null_last);
let dict = get_row_encoding_dictionary(by.dtype());
let ctxt = get_row_encoding_context(by.dtype());

cols.push(arr);
opts.push(opt);
dicts.push(dict);
ctxts.push(ctxt);
}
Ok(convert_columns(num_rows, &cols, &opts, &dicts))
Ok(convert_columns(num_rows, &cols, &opts, &ctxts))
}

pub fn _get_rows_encoded_ca(
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-expr/src/groups/row_encoded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use polars_utils::cardinality_sketch::CardinalitySketch;
use polars_utils::idx_map::bytes_idx_map::{BytesIndexMap, Entry};
use polars_utils::vec::PushUnchecked;

use self::row_encode::get_row_encoding_dictionary;
use self::row_encode::get_row_encoding_context;
use super::*;
use crate::hash_keys::HashKeys;

Expand Down Expand Up @@ -39,14 +39,14 @@ impl RowEncodedHashGrouper {
.iter()
.map(|(_name, dt)| dt.to_physical().to_arrow(CompatLevel::newest()))
.collect::<Vec<_>>();
let dicts = self
let ctxts = self
.key_schema
.iter()
.map(|(_, dt)| get_row_encoding_dictionary(dt))
.map(|(_, dt)| get_row_encoding_context(dt))
.collect::<Vec<_>>();
let fields = vec![RowEncodingOptions::new_unsorted(); key_dtypes.len()];
let key_columns =
unsafe { polars_row::decode::decode_rows(&mut key_rows, &fields, &dicts, &key_dtypes) };
unsafe { polars_row::decode::decode_rows(&mut key_rows, &fields, &ctxts, &key_dtypes) };

let cols = self
.key_schema
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::cell::UnsafeCell;

use polars_row::{RowEncodingOptions, RowsEncoded};

use self::row_encode::get_row_encoding_dictionary;
use self::row_encode::get_row_encoding_context;
use super::*;
use crate::executors::sinks::group_by::utils::prepare_key;
use crate::executors::sinks::utils::hash_rows;
Expand Down Expand Up @@ -77,7 +77,7 @@ impl Eval {
let mut dicts = Vec::with_capacity(self.key_columns_expr.len());
for phys_e in self.key_columns_expr.iter() {
let s = phys_e.evaluate(chunk, &context.execution_state)?;
dicts.push(get_row_encoding_dictionary(s.dtype()));
dicts.push(get_row_encoding_context(s.dtype()));
let s = s.to_physical_repr().into_owned();
let s = prepare_key(&s, chunk);
keys_columns.push(s.to_arrow(0, CompatLevel::newest()));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use arrow::legacy::trusted_len::TrustedLenPush;
use polars_utils::hashing::hash_to_partition;

use self::row_encode::get_row_encoding_dictionary;
use self::row_encode::get_row_encoding_context;
use super::*;
use crate::pipeline::PARTITION_SIZE;

Expand Down Expand Up @@ -262,7 +262,7 @@ impl<const FIXED: bool> AggHashTable<FIXED> {
.output_schema
.iter_values()
.take(self.num_keys)
.map(get_row_encoding_dictionary)
.map(get_row_encoding_context)
.collect::<Vec<_>>();
let fields = vec![Default::default(); self.num_keys];
let key_columns =
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-pipe/src/executors/sinks/joins/generic_build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use polars_utils::arena::Node;
use polars_utils::pl_str::PlSmallStr;
use polars_utils::unitvec;

use self::row_encode::get_row_encoding_dictionary;
use self::row_encode::get_row_encoding_context;
use super::*;
use crate::executors::operators::PlaceHolder;
use crate::executors::sinks::joins::generic_probe_inner_left::GenericJoinProbe;
Expand Down Expand Up @@ -137,17 +137,17 @@ impl<K: ExtraPayload> GenericBuild<K> {
chunk: &DataChunk,
) -> PolarsResult<&BinaryArray<i64>> {
debug_assert!(self.join_columns.is_empty());
let mut dicts = Vec::with_capacity(self.join_columns_left.len());
let mut ctxts = Vec::with_capacity(self.join_columns_left.len());
for phys_e in self.join_columns_left.iter() {
let s = phys_e.evaluate(chunk, &context.execution_state)?;
let arr = s.to_physical_repr().rechunk().array_ref(0).clone();
self.join_columns.push(arr);
dicts.push(get_row_encoding_dictionary(s.dtype()));
ctxts.push(get_row_encoding_context(s.dtype()));
}
let rows_encoded = polars_row::convert_columns_no_order(
self.join_columns[0].len(), // @NOTE: does not work for ZFS
&self.join_columns,
&dicts,
&ctxts,
)
.into_array();
self.materialized_join_cols.push(rows_encoded);
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-pipe/src/executors/sinks/joins/row_values.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::sync::Arc;
use arrow::array::{ArrayRef, BinaryArray, StaticArray};
use arrow::compute::utils::combine_validities_and_many;
use polars_core::error::PolarsResult;
use polars_core::prelude::row_encode::get_row_encoding_dictionary;
use polars_core::prelude::row_encode::get_row_encoding_context;
use polars_row::RowsEncoded;

use crate::expressions::PhysicalPipedExpr;
Expand Down Expand Up @@ -49,7 +49,7 @@ impl RowValues {
let determine_idx = self.det_join_idx && self.join_column_idx.is_none();
let mut names = vec![];

let mut dicts = Vec::with_capacity(self.join_column_eval.len());
let mut ctxts = Vec::with_capacity(self.join_column_eval.len());
for phys_e in self.join_column_eval.iter() {
let s = phys_e.evaluate(chunk, &context.execution_state)?;
let mut s = s.to_physical_repr().rechunk();
Expand All @@ -60,7 +60,7 @@ impl RowValues {
names.push(s.name().to_string());
}
self.join_columns_material.push(s.array_ref(0).clone());
dicts.push(get_row_encoding_dictionary(s.dtype()));
ctxts.push(get_row_encoding_context(s.dtype()));
}

// We determine the indices of the columns that have to be removed
Expand All @@ -79,7 +79,7 @@ impl RowValues {
polars_row::convert_columns_amortized_no_order(
self.join_columns_material[0].len(), // @NOTE: does not work for ZFS
&self.join_columns_material,
&dicts,
&ctxts,
&mut self.current_rows,
);

Expand Down
8 changes: 4 additions & 4 deletions crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use polars_core::series::IsSorted;
use polars_row::decode::decode_rows_from_binary;
use polars_row::{RowEncodingContext, RowEncodingOptions};

use self::row_encode::get_row_encoding_dictionary;
use self::row_encode::get_row_encoding_context;
use super::*;
use crate::operators::{
DataChunk, FinalizedSink, PExecutionContext, Sink, SinkResult, Source, SourceResult,
Expand Down Expand Up @@ -137,11 +137,11 @@ impl SortSinkMultiple {
) -> PolarsResult<Self> {
let mut schema = (*output_schema).clone();

let sort_dicts = sort_idx
let sort_ctxts = sort_idx
.iter()
.map(|i| {
let (_, dtype) = schema.get_at_index(*i).unwrap();
get_row_encoding_dictionary(dtype)
get_row_encoding_context(dtype)
})
.collect::<Vec<_>>();

Expand Down Expand Up @@ -182,7 +182,7 @@ impl SortSinkMultiple {
sort_options,
sort_idx: Arc::from(sort_idx),
sort_opts: Arc::from(sort_fields),
sort_dicts: Arc::from(sort_dicts),
sort_dicts: Arc::from(sort_ctxts),
sort_dtypes,
sort_column: vec![],
output_schema,
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-python/src/series/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use pyo3::prelude::*;
use pyo3::types::PyBytes;
use pyo3::{IntoPyObjectExt, Python};

use self::row_encode::get_row_encoding_dictionary;
use self::row_encode::get_row_encoding_context;
use super::PySeries;
use crate::dataframe::PyDataFrame;
use crate::error::PyPolarsErr;
Expand Down Expand Up @@ -549,7 +549,7 @@ impl PySeries {

let dicts = dtypes
.iter()
.map(|(_, dtype)| get_row_encoding_dictionary(&dtype.0))
.map(|(_, dtype)| get_row_encoding_context(&dtype.0))
.collect::<Vec<_>>();

// Get the BinaryOffset array.
Expand Down

0 comments on commit 6218b18

Please sign in to comment.