-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
chore: deprecate ValuesExec
in favour of MemoryExec
#14032
Changes from 2 commits
23afe86
2f19a54
eaaf223
388bc5d
dc1952a
577b47f
8be2ccc
8c3c84d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,14 +24,17 @@ use std::sync::Arc; | |
use std::task::{Context, Poll}; | ||
|
||
use super::{ | ||
common, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, | ||
RecordBatchStream, SendableRecordBatchStream, Statistics, | ||
common, ColumnarValue, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, | ||
PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, | ||
Statistics, | ||
}; | ||
use crate::execution_plan::{Boundedness, EmissionType}; | ||
|
||
use arrow::datatypes::SchemaRef; | ||
use arrow::record_batch::RecordBatch; | ||
use datafusion_common::{internal_err, project_schema, Result}; | ||
use arrow_array::RecordBatchOptions; | ||
use arrow_schema::Schema; | ||
use datafusion_common::{internal_err, plan_err, project_schema, Result, ScalarValue}; | ||
use datafusion_execution::memory_pool::MemoryReservation; | ||
use datafusion_execution::TaskContext; | ||
use datafusion_physical_expr::equivalence::ProjectionMapping; | ||
|
@@ -174,6 +177,93 @@ impl MemoryExec { | |
}) | ||
} | ||
|
||
/// Create a new execution plan from a list of constant values (`ValuesExec`) | ||
pub fn try_new_as_values( | ||
schema: SchemaRef, | ||
data: Vec<Vec<Arc<dyn PhysicalExpr>>>, | ||
) -> Result<Self> { | ||
if data.is_empty() { | ||
return plan_err!("Values list cannot be empty"); | ||
} | ||
|
||
let n_row = data.len(); | ||
let n_col = schema.fields().len(); | ||
|
||
let placeholder_schema = Arc::new(Schema::empty()); | ||
jonathanc-n marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let placeholder_batch = RecordBatch::try_new_with_options( | ||
Arc::clone(&placeholder_schema), | ||
vec![], | ||
&RecordBatchOptions::new().with_row_count(Some(1)), | ||
)?; | ||
|
||
// Evaluate each column | ||
let arrays = (0..n_col) | ||
.map(|j| { | ||
(0..n_row) | ||
.map(|i| { | ||
let expr = &data[i][j]; | ||
let result = expr.evaluate(&placeholder_batch)?; | ||
|
||
match result { | ||
ColumnarValue::Scalar(scalar) => Ok(scalar), | ||
ColumnarValue::Array(array) if array.len() == 1 => { | ||
ScalarValue::try_from_array(&array, 0) | ||
} | ||
ColumnarValue::Array(_) => { | ||
plan_err!("Cannot have array values in a values list") | ||
} | ||
} | ||
}) | ||
.collect::<Result<Vec<_>>>() | ||
.and_then(ScalarValue::iter_to_array) | ||
}) | ||
.collect::<Result<Vec<_>>>()?; | ||
|
||
let batch = RecordBatch::try_new_with_options( | ||
Arc::clone(&schema), | ||
arrays, | ||
&RecordBatchOptions::new().with_row_count(Some(n_row)), | ||
)?; | ||
|
||
let partitions = vec![batch]; | ||
Self::try_new_from_batches(Arc::clone(&schema), partitions) | ||
} | ||
|
||
/// Create a new plan using the provided schema and batches. | ||
/// | ||
/// Errors if any of the batches don't match the provided schema, or if no | ||
/// batches are provided. | ||
pub fn try_new_from_batches( | ||
schema: SchemaRef, | ||
batches: Vec<RecordBatch>, | ||
) -> Result<Self> { | ||
if batches.is_empty() { | ||
return plan_err!("Values list cannot be empty"); | ||
} | ||
|
||
for batch in &batches { | ||
let batch_schema = batch.schema(); | ||
if batch_schema != schema { | ||
return plan_err!( | ||
"Batch has invalid schema. Expected: {}, got: {}", | ||
schema, | ||
batch_schema | ||
); | ||
} | ||
} | ||
|
||
let cache = Self::compute_properties_as_value(Arc::clone(&schema)); | ||
Ok(Self { | ||
partitions: vec![batches], | ||
schema: Arc::clone(&schema), | ||
projected_schema: Arc::clone(&schema), | ||
projection: None, | ||
sort_information: vec![], | ||
cache, | ||
show_sizes: true, | ||
}) | ||
} | ||
|
||
/// Set `show_sizes` to determine whether to display partition sizes | ||
pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { | ||
self.show_sizes = show_sizes; | ||
|
@@ -293,6 +383,15 @@ impl MemoryExec { | |
Boundedness::Bounded, | ||
) | ||
} | ||
|
||
fn compute_properties_as_value(schema: SchemaRef) -> PlanProperties { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this something we should do prior to merging the PR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alamb Yes, just added it |
||
PlanProperties::new( | ||
EquivalenceProperties::new(schema), | ||
Partitioning::UnknownPartitioning(1), | ||
EmissionType::Incremental, | ||
Boundedness::Bounded, | ||
) | ||
} | ||
} | ||
|
||
/// Iterator over batches | ||
|
@@ -696,3 +795,96 @@ mod lazy_memory_tests { | |
Ok(()) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
use crate::expressions::lit; | ||
use crate::test::{self, make_partition}; | ||
|
||
use arrow_schema::{DataType, Field}; | ||
use datafusion_common::stats::{ColumnStatistics, Precision}; | ||
|
||
#[tokio::test] | ||
async fn values_empty_case() -> Result<()> { | ||
let schema = test::aggr_test_schema(); | ||
let empty = MemoryExec::try_new_as_values(schema, vec![]); | ||
assert!(empty.is_err()); | ||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn new_exec_with_batches() { | ||
let batch = make_partition(7); | ||
let schema = batch.schema(); | ||
let batches = vec![batch.clone(), batch]; | ||
let _exec = MemoryExec::try_new_from_batches(schema, batches).unwrap(); | ||
} | ||
|
||
#[test] | ||
fn new_exec_with_batches_empty() { | ||
let batch = make_partition(7); | ||
let schema = batch.schema(); | ||
let _ = MemoryExec::try_new_from_batches(schema, Vec::new()).unwrap_err(); | ||
} | ||
|
||
#[test] | ||
fn new_exec_with_batches_invalid_schema() { | ||
let batch = make_partition(7); | ||
let batches = vec![batch.clone(), batch]; | ||
|
||
let invalid_schema = Arc::new(Schema::new(vec![ | ||
Field::new("col0", DataType::UInt32, false), | ||
Field::new("col1", DataType::Utf8, false), | ||
])); | ||
let _ = MemoryExec::try_new_from_batches(invalid_schema, batches).unwrap_err(); | ||
} | ||
|
||
// Test issue: https://github.com/apache/datafusion/issues/8763 | ||
#[test] | ||
fn new_exec_with_non_nullable_schema() { | ||
let schema = Arc::new(Schema::new(vec![Field::new( | ||
"col0", | ||
DataType::UInt32, | ||
false, | ||
)])); | ||
let _ = MemoryExec::try_new_as_values(Arc::clone(&schema), vec![vec![lit(1u32)]]) | ||
.unwrap(); | ||
// Test that a null value is rejected | ||
let _ = MemoryExec::try_new_as_values( | ||
schema, | ||
vec![vec![lit(ScalarValue::UInt32(None))]], | ||
) | ||
.unwrap_err(); | ||
} | ||
|
||
#[test] | ||
fn values_stats_with_nulls_only() -> Result<()> { | ||
let data = vec![ | ||
vec![lit(ScalarValue::Null)], | ||
vec![lit(ScalarValue::Null)], | ||
vec![lit(ScalarValue::Null)], | ||
]; | ||
let rows = data.len(); | ||
let values = MemoryExec::try_new_as_values( | ||
Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])), | ||
data, | ||
)?; | ||
|
||
assert_eq!( | ||
values.statistics()?, | ||
Statistics { | ||
num_rows: Precision::Exact(rows), | ||
total_byte_size: Precision::Exact(8), // not important | ||
column_statistics: vec![ColumnStatistics { | ||
null_count: Precision::Exact(rows), // there are only nulls | ||
distinct_count: Precision::Absent, | ||
max_value: Precision::Absent, | ||
min_value: Precision::Absent, | ||
},], | ||
} | ||
); | ||
|
||
Ok(()) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this PR I think it would be good to port this code (in the physical planner) to use
MemoryExec
-- that way queries will run throughMemoryExec
and we will confidence thatValuesExec
can really be removedThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alamb Should be fine now