From 1869363fe14ab91dd7a10a1e71e588df58930978 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 7 Mar 2023 10:37:51 +0300 Subject: [PATCH 01/55] first implementation and tests of timestamp subtraction --- datafusion/common/Cargo.toml | 1 + datafusion/common/src/scalar.rs | 709 ++++++++++++++++++++++++++++++-- 2 files changed, 684 insertions(+), 26 deletions(-) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 96367f0c1959..fa0d0c71a60c 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -47,4 +47,5 @@ num_cpus = "1.13.0" object_store = { version = "0.5.4", default-features = false, optional = true } parquet = { version = "34.0.0", default-features = false, optional = true } pyo3 = { version = "0.18.0", optional = true } +rand = "0.8.4" sqlparser = "0.30" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 2123281217ba..d9d7f3744b34 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -43,7 +43,9 @@ use arrow::{ DECIMAL128_MAX_PRECISION, }, }; -use chrono::{Datelike, Duration, NaiveDate, NaiveDateTime}; +use chrono::{ + DateTime, Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime, Timelike, +}; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part to arrow's [`Array`]. @@ -503,6 +505,29 @@ macro_rules! 
impl_op { (ScalarValue::Int8(lhs), ScalarValue::Int8(rhs)) => { primitive_op!(lhs, rhs, Int8, $OPERATION) } + ( + ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), + ) => Ok(ts_nanosec_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, + )?), + ( + ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampMicrosecond(Some(ts_rhs), tz_rhs), + ) => Ok(ts_microsec_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, + )?), + ( + ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), + ) => Ok(ts_millisec_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, + )?), + ( + ScalarValue::TimestampSecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampSecond(Some(ts_rhs), tz_rhs), + ) => Ok(ts_sec_sub_to_interval(&ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs)?), + // Binary operations on arguments with different types: (ScalarValue::Date32(Some(days)), _) => { let value = date32_add(*days, $RHS, get_sign!($OPERATION))?; @@ -547,6 +572,272 @@ macro_rules! 
get_sign { }; } +// all timestamp variants are converted to nanosecond scale +#[inline] +fn ts_microsec_sub_to_interval( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, +) -> Result { + match (lhs_ts.checked_mul(1_000), rhs_ts.checked_mul(1_000)) { + (Some(lhs_ns), Some(rhs_ns)) => { + ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) + } + (None, _) => Err(DataFusionError::NotImplemented(format!( + "overflow while conversion of {lhs_ts:?}" + ))), + (_, None) => Err(DataFusionError::NotImplemented(format!( + "overflow while conversion of {rhs_ts:?}" + ))), + } +} +#[inline] +fn ts_millisec_sub_to_interval( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, +) -> Result { + match (lhs_ts.checked_mul(1_000_000), rhs_ts.checked_mul(1_000_000)) { + (Some(lhs_ns), Some(rhs_ns)) => { + ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) + } + (None, _) => Err(DataFusionError::NotImplemented(format!( + "overflow while conversion of {lhs_ts:?}" + ))), + (_, None) => Err(DataFusionError::NotImplemented(format!( + "overflow while conversion of {rhs_ts:?}" + ))), + } +} +#[inline] +fn ts_sec_sub_to_interval( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, +) -> Result { + match ( + lhs_ts.checked_mul(1_000_000_000), + rhs_ts.checked_mul(1_000_000_000), + ) { + (Some(lhs_ns), Some(rhs_ns)) => { + ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) + } + (None, _) => Err(DataFusionError::NotImplemented(format!( + "overflow while conversion of {lhs_ts:?}" + ))), + (_, None) => Err(DataFusionError::NotImplemented(format!( + "overflow while conversion of {rhs_ts:?}" + ))), + } +} + +// Nanosecond-scale timestamps are subtracted to result in the narrowest interval variant. +// Interval variants are always consist of the same signed parts to handle comparison operations more wisely. 
+// For example, lhs < rhs => Interval(-, -, -), lhs > rhs => Interval(+, +, +) +fn ts_nanosec_sub_to_interval( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, +) -> Result { + // Conversion of integer and string-typed timestamps to NaiveDateTime objects + // Timezone offsets are added also if applicable. + let (naive_date_time2_unchecked, naive_date_time1_unchecked); + if let (Some(l), Some(r)) = (lhs_tz, rhs_tz) { + (naive_date_time2_unchecked, naive_date_time1_unchecked) = + integer_w_timezone_to_naive_datetime(lhs_ts, rhs_ts, l, r)?; + } else { + (naive_date_time2_unchecked, naive_date_time1_unchecked) = + integer_to_naive_datetime(lhs_ts, rhs_ts)?; + } + + // Check whether we will find a negative interval or not + let (naive_date_time2, naive_date_time1, sign) = + find_interval_sign(naive_date_time2_unchecked, naive_date_time1_unchecked); + + // Subtraction of datetimes. Details are inside the function. + let (mut months, mut months_residual) = + datetime_month_sub_with_rem(naive_date_time2, naive_date_time1)?; + + // Check whether we can return an IntervalYearMonth variant without losing information + match months_residual.num_nanoseconds() { + Some(value) => { + if value == 0 { + return Ok(ScalarValue::IntervalYearMonth(Some(sign * months))); + } + } + None => { + return Err(DataFusionError::NotImplemented(String::from( + "months_residual nanosec overflow", + ))) + } + } + + // If months_residual is negative, take one month from months and + // add it to months_residual to make it positive. + // To ensure the difference is positive all the time, we take the days + // of previous datetime's month. 
+ if months_residual.num_nanoseconds() < Some(0) { + (months, months_residual) = + normalize_duration(&months, &months_residual, naive_date_time1)?; + } + + // Check whether we can return an IntervalDayTime variant without losing information + let months_residual_in_ns = months_residual.num_nanoseconds().unwrap(); + if months_residual_in_ns % 1_000_000 == 0 { + let delta_secs = naive_date_time2 + .signed_duration_since(naive_date_time1) + .num_milliseconds(); + + return Ok(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value( + sign * (delta_secs / 86_400_000) as i32, + sign * (delta_secs % 86_400_000) as i32, + ), + ))); + } + + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value( + sign * months, + sign * (months_residual_in_ns / 86_400_000_000_000) as i32, + sign as i64 * (months_residual_in_ns % 86_400_000_000_000), + ), + ))) +} +#[inline] +fn integer_to_naive_datetime( + lhs_ts_ns: &i64, + rhs_ts_ns: &i64, +) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { + match ( + NaiveDateTime::from_timestamp_opt( + lhs_ts_ns / 1_000_000_000, + (lhs_ts_ns % 1_000_000_000) as u32, + ), + NaiveDateTime::from_timestamp_opt( + rhs_ts_ns / 1_000_000_000, + (rhs_ts_ns % 1_000_000_000) as u32, + ), + ) { + (Some(x), Some(y)) => Ok((x, y)), + (x, y) => Err(DataFusionError::NotImplemented(format!( + "timestamps {x:?} or {y:?} cannot be converted to datetimes", + ))), + } +} +#[inline] +fn integer_w_timezone_to_naive_datetime( + lhs_ts_ns: &i64, + rhs_ts_ns: &i64, + lhs_tz: &String, + rhs_tz: &String, +) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { + let (naive_lhs, naive_rhs) = integer_to_naive_datetime(lhs_ts_ns, rhs_ts_ns)?; + + match (parse_tz_to_offset(lhs_tz), parse_tz_to_offset(rhs_tz)) { + (Some(l), Some(r)) => Ok(( + DateTime::::from_utc(naive_lhs, l).naive_local(), + DateTime::::from_utc(naive_rhs, r).naive_local(), + )), + (_, _) => Ok((naive_lhs, naive_rhs)), + } +} +// This function parses as 
the format of "+HH:MM", for example, "+05:30" +#[inline] +fn parse_tz_to_offset(tz: &String) -> Option { + let sign = tz.chars().next().unwrap(); + let hours = tz[1..3].parse::().unwrap(); + let minutes = tz[4..6].parse::().unwrap(); + let timezone_offset = match sign { + '-' => FixedOffset::east_opt(hours * 3600 + minutes * 60).unwrap(), + '+' => FixedOffset::west_opt(hours * 3600 + minutes * 60).unwrap(), + _ => panic!("Invalid timezone string: {}", tz), + }; + Some(timezone_offset) +} +#[inline] +fn find_interval_sign( + ndt2: NaiveDateTime, + ndt1: NaiveDateTime, +) -> (NaiveDateTime, NaiveDateTime, i32) { + let sign; + if ndt2.timestamp_nanos() < ndt1.timestamp_nanos() { + sign = -1; + (ndt1, ndt2, sign) + } else { + sign = 1; + (ndt2, ndt1, sign) + } +} +#[inline] +fn datetime_month_sub_with_rem( + date_time2: NaiveDateTime, + date_time1: NaiveDateTime, +) -> Result<(i32, Duration), DataFusionError> { + // The difference of total months. Since this operation ignores the days of dates, + // that month count may be decreased by 1 in case of negative day count. + let months = (date_time2.year() - date_time1.year()) * 12 + + (date_time2.month() as i32 - date_time1.month() as i32); + + // months_residual is in the form of X secs, Y nanosecs. + // Y cannot be larger than 1_000_000_000, it is rounded up to seconds. + // The subtractions may overflow, so cast i64. 
+ let months_residual = + Duration::days(date_time2.day() as i64 - date_time1.day() as i64) + + Duration::hours(date_time2.hour() as i64 - date_time1.hour() as i64) + + Duration::minutes(date_time2.minute() as i64 - date_time1.minute() as i64) + + Duration::seconds(date_time2.second() as i64 - date_time1.second() as i64) + + Duration::nanoseconds( + date_time2.nanosecond() as i64 - date_time1.nanosecond() as i64, + ); + + Ok((months, months_residual)) +} +#[inline] +fn normalize_duration( + months: &i32, + months_residual: &Duration, + at_month: NaiveDateTime, +) -> Result<(i32, Duration), DataFusionError> { + // For example, if the previous datetime's month and date is (Feb, 15), + // when we add the days of that month to month_residual + // variable, we need to add the february's day count. + // To ensure the difference is positive all the time, we take the days + // of previous datetime's month. + let added_days = + &Duration::days(days_in_month(at_month.year(), at_month.month())?.into()); + let months_residual_new = match months_residual.checked_add(added_days) { + Some(value) => value, + None => { + return Err(DataFusionError::NotImplemented(format!( + "normalize duration error, cannot add {added_days:?} days to {months_residual:?}", + ))) + } + }; + let months_new = months - 1; + Ok((months_new, months_residual_new)) +} +#[inline] +// It gives the day count of the corresponding month at that year. 
+fn days_in_month(year: i32, month: u32) -> Result { + if let Some(first_day) = NaiveDate::from_ymd_opt(year, month, 1) { + let last_day = first_day + .with_month(month + 1) + .unwrap_or_else(|| NaiveDate::from_ymd_opt(year + 1, 1, 1).unwrap()) + .pred_opt(); + if let Some(days) = last_day { + return Ok(days.day()); + } + } + Err(DataFusionError::NotImplemented(format!( + "invalid date parameters, year: {year:?} & month: {month:?}", + ))) +} + #[inline] pub fn date32_add(days: i32, scalar: &ScalarValue, sign: i32) -> Result { let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); @@ -1032,8 +1323,8 @@ impl ScalarValue { DataType::UInt16 => ScalarValue::UInt16(Some(0)), DataType::UInt32 => ScalarValue::UInt32(Some(0)), DataType::UInt64 => ScalarValue::UInt64(Some(0)), - DataType::Float32 => ScalarValue::Float32(Some(0.0)), - DataType::Float64 => ScalarValue::Float64(Some(0.0)), + DataType::Float32 => ScalarValue::UInt64(Some(0)), + DataType::Float64 => ScalarValue::UInt64(Some(0)), _ => { return Err(DataFusionError::NotImplemented(format!( "Can't create a zero scalar from data_type \"{datatype:?}\"" @@ -1296,7 +1587,7 @@ impl ScalarValue { } macro_rules! 
build_array_primitive_tz { - ($ARRAY_TY:ident, $SCALAR_TY:ident, $TZ:expr) => {{ + ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ { let array = scalars.map(|sv| { if let ScalarValue::$SCALAR_TY(v, _) = sv { @@ -1310,7 +1601,7 @@ impl ScalarValue { } }) .collect::>()?; - Arc::new(array.with_timezone_opt($TZ.clone())) + Arc::new(array) } }}; } @@ -1444,29 +1735,17 @@ impl ScalarValue { DataType::Time64(TimeUnit::Nanosecond) => { build_array_primitive!(Time64NanosecondArray, Time64Nanosecond) } - DataType::Timestamp(TimeUnit::Second, tz) => { - build_array_primitive_tz!(TimestampSecondArray, TimestampSecond, tz) + DataType::Timestamp(TimeUnit::Second, _) => { + build_array_primitive_tz!(TimestampSecondArray, TimestampSecond) } - DataType::Timestamp(TimeUnit::Millisecond, tz) => { - build_array_primitive_tz!( - TimestampMillisecondArray, - TimestampMillisecond, - tz - ) + DataType::Timestamp(TimeUnit::Millisecond, _) => { + build_array_primitive_tz!(TimestampMillisecondArray, TimestampMillisecond) } - DataType::Timestamp(TimeUnit::Microsecond, tz) => { - build_array_primitive_tz!( - TimestampMicrosecondArray, - TimestampMicrosecond, - tz - ) + DataType::Timestamp(TimeUnit::Microsecond, _) => { + build_array_primitive_tz!(TimestampMicrosecondArray, TimestampMicrosecond) } - DataType::Timestamp(TimeUnit::Nanosecond, tz) => { - build_array_primitive_tz!( - TimestampNanosecondArray, - TimestampNanosecond, - tz - ) + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + build_array_primitive_tz!(TimestampNanosecondArray, TimestampNanosecond) } DataType::Interval(IntervalUnit::DayTime) => { build_array_primitive!(IntervalDayTimeArray, IntervalDayTime) @@ -2659,7 +2938,7 @@ impl TryFrom<&DataType> for ScalarValue { macro_rules! 
format_option { ($F:expr, $EXPR:expr) => {{ match $EXPR { - Some(e) => write!($F, "{e}"), + Some(e) => write!($F, "{}", e), None => write!($F, "NULL"), } }}; @@ -2887,6 +3166,7 @@ mod tests { use arrow::compute::kernels; use arrow::datatypes::ArrowPrimitiveType; + use rand::Rng; use crate::cast::{as_string_array, as_uint32_array, as_uint64_array}; use crate::from_slice::FromSlice; @@ -4430,4 +4710,381 @@ mod tests { assert!(distance.is_none()); } } + + #[test] + fn timestamp_op_tests() { + // positive interval, edge cases + let timestamps_next = new_timestamps_next(); + let timestamps_prev = new_timestamps_prev(); + let expected_results = new_expected_results(1); + for (idx, exp) in expected_results.iter().enumerate() { + assert_eq!( + *exp, + timestamps_next[idx].sub(×tamps_prev[idx]).unwrap() + ) + } + + // negative interval, edge cases + let timestamps_next = new_timestamps_prev(); + let timestamps_prev = new_timestamps_next(); + let expected_results = new_expected_results(-1); + for (idx, exp) in expected_results.iter().enumerate() { + assert_eq!( + *exp, + timestamps_next[idx].sub(×tamps_prev[idx]).unwrap() + ); + } + + // timestamp1 + (or -) interval = timestamp2 + // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? + let sample_size = 100000; + let timestamps1 = get_random_timestamps1(sample_size); + let intervals = get_random_intervals(sample_size); + // ts(sec) + interval(ns) = ts(sec); however, + // ts(sec) - ts(sec) cannot be = interval(ns). Therefore, + // timestamps are more precise than intervals in tests. 
+ let mut timestamp2: ScalarValue; + for (idx, ts1) in timestamps1.iter().enumerate() { + if idx % 2 == 0 { + timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); + println!( + "{:?}, {:?}, {:?}, {:?}", + idx, timestamp2, ts1, intervals[idx] + ); + assert_eq!(intervals[idx], timestamp2.sub(ts1).unwrap()); + } else { + timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); + println!( + "{:?}, {:?}, {:?}, {:?}", + idx, timestamp2, ts1, intervals[idx] + ); + assert_eq!(intervals[idx], ts1.sub(timestamp2).unwrap()); + }; + } + } + + fn new_timestamps_next() -> Vec { + vec![ + // ScalarValue::TimestampNanosecond(Some(1308158638939668236), None), + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_nano_opt(1, 0, 0, 000_000_000) + .unwrap() + .timestamp_nanos(), + ), + Some("+01:00".to_string()), + ), + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_micro_opt(2, 0, 0, 000_000) + .unwrap() + .timestamp_micros(), + ), + Some("+01:00".to_string()), + ), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_milli_opt(10, 10, 0, 000) + .unwrap() + .timestamp_millis(), + ), + Some("+10:10".to_string()), + ), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + Some("-11:59".to_string()), + ), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_milli_opt(23, 58, 0, 250) + .unwrap() + .timestamp_millis(), + ), + Some("+11:59".to_string()), + ), + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_micro_opt(0, 0, 0, 15) + .unwrap() + .timestamp_micros(), + ), + None, + ), + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 22) + .unwrap() + 
.timestamp_nanos(), + ), + None, + ), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, + ), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 12, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, + ), + ] + } + + fn new_timestamps_prev() -> Vec { + vec![ + // ScalarValue::TimestampNanosecond(Some(1171521569027710670), None), + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 000_000_000) + .unwrap() + .timestamp_nanos(), + ), + Some("+00:00".to_string()), + ), + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_micro_opt(0, 0, 0, 000_000) + .unwrap() + .timestamp_micros(), + ), + Some("-01:00".to_string()), + ), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_milli_opt(1, 0, 0, 000) + .unwrap() + .timestamp_millis(), + ), + Some("+01:00".to_string()), + ), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_opt(23, 58, 0) + .unwrap() + .timestamp(), + ), + Some("+11:59".to_string()), + ), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_milli_opt(0, 0, 0, 000) + .unwrap() + .timestamp_millis(), + ), + Some("-11:59".to_string()), + ), + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_micro_opt(0, 0, 0, 000_000) + .unwrap() + .timestamp_micros(), + ), + None, + ), + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 31) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 000_000_000) + .unwrap() + .timestamp_nanos(), + ), + None, + ), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2021, 12, 30) + .unwrap() + .and_hms_opt(0, 0, 30) + 
.unwrap() + .timestamp(), + ), + None, + ), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(1980, 11, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, + ), + ] + } + + fn new_expected_results(sign: i32) -> Vec { + vec![ + // ScalarValue::IntervalMonthDayNano(Some(4040636288743990090004520869950)), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 0))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, + sign * 2, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, + sign * 2, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, + sign * 2, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + sign * 250, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(sign * 2, 0, sign as i64 * 15_000), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(sign, sign, sign as i64 * 22), + )), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 425, + sign * 86370000, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + sign * 43, + sign, + ))), + ] + } + + fn get_random_timestamps1(sample_size: u64) -> Vec { + let vector_size = sample_size; + let mut timestamp = vec![]; + let mut rng = rand::thread_rng(); + for i in 0..vector_size { + let year = rng.gen_range(1995..=2050); + let month = rng.gen_range(1..=12); + let day = rng.gen_range(1..=28); + let hour = rng.gen_range(0..=23); + let minute = rng.gen_range(0..=59); + let second = rng.gen_range(0..=59); + if i % 4 == 0 { + timestamp.push(ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(year, month, day) + .unwrap() + .and_hms_opt(hour, minute, second) + .unwrap() + .timestamp(), + ), + None, + )) + } else if i % 4 == 1 { + let millisec = rng.gen_range(0..=999); + 
timestamp.push(ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(year, month, day) + .unwrap() + .and_hms_milli_opt(hour, minute, second, millisec) + .unwrap() + .timestamp_millis(), + ), + None, + )) + } else if i % 4 == 2 { + let microsec = rng.gen_range(0..=999_999); + timestamp.push(ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(year, month, day) + .unwrap() + .and_hms_micro_opt(hour, minute, second, microsec) + .unwrap() + .timestamp_micros(), + ), + None, + )) + } else if i % 4 == 3 { + let nanosec = rng.gen_range(0..=999_999_999); + timestamp.push(ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(year, month, day) + .unwrap() + .and_hms_nano_opt(hour, minute, second, nanosec) + .unwrap() + .timestamp_nanos(), + ), + None, + )) + } + } + timestamp + } + + fn get_random_intervals(sample_size: u64) -> Vec { + let vector_size = sample_size; + let mut intervals = vec![]; + let mut rng = rand::thread_rng(); + for i in 0..vector_size { + if i % 3 == 2 && i % 4 == 3 { + let month = rng.gen_range(0..=100); + // there is an test issue for the days 28(29). + // for example, if we have an expected interval 2 months 28(29) days, + // the subtractor finds it as 3 months if the previous timestamp + // is at february. 
+ let day = rng.gen_range(0..=27); + let nanosec = rng.gen_range(0..86_400_000_000_000); + intervals.push(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(month, day, nanosec), + ))); + } else if i % 3 == 1 && i % 4 != 0 { + let day = rng.gen_range(0..=5000); + let millisec = rng.gen_range(0..86_400_000); + intervals.push(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value(day, millisec), + ))) + } else { + let year = rng.gen_range(0..=20); + let month = rng.gen_range(0..=50); + intervals.push(ScalarValue::IntervalYearMonth(Some( + IntervalYearMonthType::make_value(year, month), + ))) + } + } + intervals + } } From 2f0127832569f49cb3e1a2ed8e0a0fe7692cbc53 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 7 Mar 2023 15:18:19 +0300 Subject: [PATCH 02/55] improvement after review --- datafusion/common/src/scalar.rs | 122 ++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 53 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index d9d7f3744b34..c98b0ff28f6d 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -645,14 +645,12 @@ fn ts_nanosec_sub_to_interval( ) -> Result { // Conversion of integer and string-typed timestamps to NaiveDateTime objects // Timezone offsets are added also if applicable. - let (naive_date_time2_unchecked, naive_date_time1_unchecked); - if let (Some(l), Some(r)) = (lhs_tz, rhs_tz) { - (naive_date_time2_unchecked, naive_date_time1_unchecked) = - integer_w_timezone_to_naive_datetime(lhs_ts, rhs_ts, l, r)?; - } else { - (naive_date_time2_unchecked, naive_date_time1_unchecked) = - integer_to_naive_datetime(lhs_ts, rhs_ts)?; - } + let (naive_date_time2_unchecked, naive_date_time1_unchecked) = + if let (Some(l), Some(r)) = (lhs_tz, rhs_tz) { + integer_w_timezone_to_naive_datetime(lhs_ts, rhs_ts, l, r)? + } else { + integer_to_naive_datetime(lhs_ts, rhs_ts)? 
+ }; // Check whether we will find a negative interval or not let (naive_date_time2, naive_date_time1, sign) = @@ -662,18 +660,13 @@ fn ts_nanosec_sub_to_interval( let (mut months, mut months_residual) = datetime_month_sub_with_rem(naive_date_time2, naive_date_time1)?; + let err = || { + DataFusionError::NotImplemented(String::from("months_residual nanosec overflow")) + }; // Check whether we can return an IntervalYearMonth variant without losing information - match months_residual.num_nanoseconds() { - Some(value) => { - if value == 0 { - return Ok(ScalarValue::IntervalYearMonth(Some(sign * months))); - } - } - None => { - return Err(DataFusionError::NotImplemented(String::from( - "months_residual nanosec overflow", - ))) - } + let value = months_residual.num_nanoseconds().ok_or_else(err)?; + if value == 0 { + return Ok(ScalarValue::IntervalYearMonth(Some(sign * months))); } // If months_residual is negative, take one month from months and @@ -686,12 +679,12 @@ fn ts_nanosec_sub_to_interval( } // Check whether we can return an IntervalDayTime variant without losing information - let months_residual_in_ns = months_residual.num_nanoseconds().unwrap(); + let months_residual_in_ns = months_residual.num_nanoseconds().ok_or_else(err)?; if months_residual_in_ns % 1_000_000 == 0 { let delta_secs = naive_date_time2 .signed_duration_since(naive_date_time1) .num_milliseconds(); - + // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day return Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( sign * (delta_secs / 86_400_000) as i32, @@ -700,6 +693,7 @@ fn ts_nanosec_sub_to_interval( ))); } + // 60 * 60 * 24 * 1000 * 1000 * 1000 = 86_400_000_000_000, number of nanosecs in a day Ok(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value( sign * months, @@ -739,7 +733,7 @@ fn integer_w_timezone_to_naive_datetime( let (naive_lhs, naive_rhs) = integer_to_naive_datetime(lhs_ts_ns, rhs_ts_ns)?; match (parse_tz_to_offset(lhs_tz), 
parse_tz_to_offset(rhs_tz)) { - (Some(l), Some(r)) => Ok(( + (Ok(l), Ok(r)) => Ok(( DateTime::::from_utc(naive_lhs, l).naive_local(), DateTime::::from_utc(naive_rhs, r).naive_local(), )), @@ -748,31 +742,39 @@ fn integer_w_timezone_to_naive_datetime( } // This function parses as the format of "+HH:MM", for example, "+05:30" #[inline] -fn parse_tz_to_offset(tz: &String) -> Option { - let sign = tz.chars().next().unwrap(); - let hours = tz[1..3].parse::().unwrap(); - let minutes = tz[4..6].parse::().unwrap(); +fn parse_tz_to_offset(tz: &String) -> Result { + let err_str = &String::from("error while parsing timezone"); + let err = || DataFusionError::NotImplemented(err_str.to_string()); + + let sign = tz.chars().next().ok_or_else(err)?; + let hours = tz[1..3] + .parse::() + .map_err(|_e| DataFusionError::NotImplemented(err_str.to_string()))?; + let minutes = tz[4..6] + .parse::() + .map_err(|_e| DataFusionError::NotImplemented(err_str.to_string()))?; let timezone_offset = match sign { - '-' => FixedOffset::east_opt(hours * 3600 + minutes * 60).unwrap(), - '+' => FixedOffset::west_opt(hours * 3600 + minutes * 60).unwrap(), - _ => panic!("Invalid timezone string: {}", tz), + '-' => FixedOffset::east_opt(hours * 3600 + minutes * 60).ok_or_else(err)?, + '+' => FixedOffset::west_opt(hours * 3600 + minutes * 60).ok_or_else(err)?, + _ => { + return Err(DataFusionError::NotImplemented(err_str.to_string())); + } }; - Some(timezone_offset) + Ok(timezone_offset) } #[inline] fn find_interval_sign( ndt2: NaiveDateTime, ndt1: NaiveDateTime, ) -> (NaiveDateTime, NaiveDateTime, i32) { - let sign; if ndt2.timestamp_nanos() < ndt1.timestamp_nanos() { - sign = -1; - (ndt1, ndt2, sign) + (ndt1, ndt2, -1) } else { - sign = 1; - (ndt2, ndt1, sign) + (ndt2, ndt1, 1) } } +// This function assumes 'date_time2' is greater than 'date_time1', +// therefore; resulting 'months' cannot be negative. 
#[inline] fn datetime_month_sub_with_rem( date_time2: NaiveDateTime, @@ -825,11 +827,13 @@ fn normalize_duration( // It gives the day count of the corresponding month at that year. fn days_in_month(year: i32, month: u32) -> Result { if let Some(first_day) = NaiveDate::from_ymd_opt(year, month, 1) { - let last_day = first_day - .with_month(month + 1) - .unwrap_or_else(|| NaiveDate::from_ymd_opt(year + 1, 1, 1).unwrap()) - .pred_opt(); - if let Some(days) = last_day { + let last_day = match first_day.with_month(month + 1) { + Some(day) => day, + None => NaiveDate::from_ymd_opt(year + 1, 1, 1).ok_or_else(|| { + DataFusionError::NotImplemented(format!("out-of-range year",)) + })?, + }; + if let Some(days) = last_day.pred_opt() { return Ok(days.day()); } } @@ -1323,8 +1327,8 @@ impl ScalarValue { DataType::UInt16 => ScalarValue::UInt16(Some(0)), DataType::UInt32 => ScalarValue::UInt32(Some(0)), DataType::UInt64 => ScalarValue::UInt64(Some(0)), - DataType::Float32 => ScalarValue::UInt64(Some(0)), - DataType::Float64 => ScalarValue::UInt64(Some(0)), + DataType::Float32 => ScalarValue::Float32(Some(0.0)), + DataType::Float64 => ScalarValue::Float64(Some(0.0)), _ => { return Err(DataFusionError::NotImplemented(format!( "Can't create a zero scalar from data_type \"{datatype:?}\"" @@ -1587,7 +1591,7 @@ impl ScalarValue { } macro_rules! 
build_array_primitive_tz { - ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ + ($ARRAY_TY:ident, $SCALAR_TY:ident, $TZ:expr) => {{ { let array = scalars.map(|sv| { if let ScalarValue::$SCALAR_TY(v, _) = sv { @@ -1601,7 +1605,7 @@ impl ScalarValue { } }) .collect::>()?; - Arc::new(array) + Arc::new(array.with_timezone_opt($TZ.clone())) } }}; } @@ -1735,17 +1739,29 @@ impl ScalarValue { DataType::Time64(TimeUnit::Nanosecond) => { build_array_primitive!(Time64NanosecondArray, Time64Nanosecond) } - DataType::Timestamp(TimeUnit::Second, _) => { - build_array_primitive_tz!(TimestampSecondArray, TimestampSecond) + DataType::Timestamp(TimeUnit::Second, tz) => { + build_array_primitive_tz!(TimestampSecondArray, TimestampSecond, tz) } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - build_array_primitive_tz!(TimestampMillisecondArray, TimestampMillisecond) + DataType::Timestamp(TimeUnit::Millisecond, tz) => { + build_array_primitive_tz!( + TimestampMillisecondArray, + TimestampMillisecond, + tz + ) } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - build_array_primitive_tz!(TimestampMicrosecondArray, TimestampMicrosecond) + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + build_array_primitive_tz!( + TimestampMicrosecondArray, + TimestampMicrosecond, + tz + ) } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - build_array_primitive_tz!(TimestampNanosecondArray, TimestampNanosecond) + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + build_array_primitive_tz!( + TimestampNanosecondArray, + TimestampNanosecond, + tz + ) } DataType::Interval(IntervalUnit::DayTime) => { build_array_primitive!(IntervalDayTimeArray, IntervalDayTime) @@ -2938,7 +2954,7 @@ impl TryFrom<&DataType> for ScalarValue { macro_rules! 
format_option { ($F:expr, $EXPR:expr) => {{ match $EXPR { - Some(e) => write!($F, "{}", e), + Some(e) => write!($F, "{e}"), None => write!($F, "NULL"), } }}; From 806b4d3a4f9199ab8f69253fbb96e1bc2fb6712c Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 8 Mar 2023 13:25:25 +0300 Subject: [PATCH 03/55] postgre interval format option --- datafusion/common/src/scalar.rs | 184 +++++++++++++++++++++++++------- 1 file changed, 148 insertions(+), 36 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index c98b0ff28f6d..2e31b24811d4 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -634,10 +634,26 @@ fn ts_sec_sub_to_interval( } } +// This function will be removed once the result format is clarified. +fn ts_nanosec_sub_to_interval( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, +) -> Result { + let round_up_to_month = true; + + if round_up_to_month { + ts_nanosec_sub_to_interval_months(lhs_ts, rhs_ts, lhs_tz, rhs_tz) + } else { + ts_nanosec_sub_to_interval_days(lhs_ts, rhs_ts, lhs_tz, rhs_tz) + } +} + // Nanosecond-scale timestamps are subtracted to result in the narrowest interval variant. // Interval variants are always consist of the same signed parts to handle comparison operations more wisely. // For example, lhs < rhs => Interval(-, -, -), lhs > rhs => Interval(+, +, +) -fn ts_nanosec_sub_to_interval( +fn ts_nanosec_sub_to_interval_months( lhs_ts: &i64, rhs_ts: &i64, lhs_tz: &Option, @@ -702,6 +718,51 @@ fn ts_nanosec_sub_to_interval( ), ))) } + +// Nanosecond-scale timestamps are subtracted to result in the narrowest interval variant. +// Interval variants are always consist of the same signed parts to handle comparison operations more wisely. 
+// For example, lhs < rhs => Interval(-, -, -), lhs > rhs => Interval(+, +, +) +fn ts_nanosec_sub_to_interval_days( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, +) -> Result { + // Conversion of integer and string-typed timestamps to NaiveDateTime objects + // Timezone offsets are added also if applicable. + let (naive_date_time2_unchecked, naive_date_time1_unchecked) = + if let (Some(l), Some(r)) = (lhs_tz, rhs_tz) { + integer_w_timezone_to_naive_datetime(lhs_ts, rhs_ts, l, r)? + } else { + integer_to_naive_datetime(lhs_ts, rhs_ts)? + }; + + // Check whether we will find a negative interval or not + let (naive_date_time2, naive_date_time1, sign) = + find_interval_sign(naive_date_time2_unchecked, naive_date_time1_unchecked); + + // Subtraction of datetimes. Details are inside the function. + let duration_in_nanosec = datetime_day_sub(naive_date_time2, naive_date_time1)?; + + // Try to return in IntervalDayTime + if duration_in_nanosec % 1_000_000 == 0 { + return Ok(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value( + sign * (duration_in_nanosec / 86_400_000_000_000) as i32, + sign * ((duration_in_nanosec / 1_000_000) % 86_400_000) as i32, + ), + ))); + } + + // The last option IntervalMonthDayNano + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value( + 0, + sign * (duration_in_nanosec / 86_400_000_000_000) as i32, + sign as i64 * (duration_in_nanosec % 86_400_000_000_000), + ), + ))) +} #[inline] fn integer_to_naive_datetime( lhs_ts_ns: &i64, @@ -727,8 +788,8 @@ fn integer_to_naive_datetime( fn integer_w_timezone_to_naive_datetime( lhs_ts_ns: &i64, rhs_ts_ns: &i64, - lhs_tz: &String, - rhs_tz: &String, + lhs_tz: &str, + rhs_tz: &str, ) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { let (naive_lhs, naive_rhs) = integer_to_naive_datetime(lhs_ts_ns, rhs_ts_ns)?; @@ -742,7 +803,7 @@ fn integer_w_timezone_to_naive_datetime( } // This function parses as the format of "+HH:MM", for 
example, "+05:30" #[inline] -fn parse_tz_to_offset(tz: &String) -> Result { +fn parse_tz_to_offset(tz: &str) -> Result { let err_str = &String::from("error while parsing timezone"); let err = || DataFusionError::NotImplemented(err_str.to_string()); @@ -800,6 +861,21 @@ fn datetime_month_sub_with_rem( Ok((months, months_residual)) } #[inline] +// This function assumes 'date_time2' is greater than 'date_time1', +// therefore; the result cannot be negative. +fn datetime_day_sub( + date_time2: NaiveDateTime, + date_time1: NaiveDateTime, +) -> Result { + // We directly take the difference of datetimes in nanosecond precision. + date_time2 + .signed_duration_since(date_time1) + .num_nanoseconds() + .ok_or(DataFusionError::NotImplemented(String::from( + "datetime subtraction overflow", + ))) +} +#[inline] fn normalize_duration( months: &i32, months_residual: &Duration, @@ -830,7 +906,7 @@ fn days_in_month(year: i32, month: u32) -> Result { let last_day = match first_day.with_month(month + 1) { Some(day) => day, None => NaiveDate::from_ymd_opt(year + 1, 1, 1).ok_or_else(|| { - DataFusionError::NotImplemented(format!("out-of-range year",)) + DataFusionError::NotImplemented(format!("out of range year: 1+{year}")) })?, }; if let Some(days) = last_day.pred_opt() { @@ -4729,10 +4805,15 @@ mod tests { #[test] fn timestamp_op_tests() { + let round_up_to_month = true; // positive interval, edge cases let timestamps_next = new_timestamps_next(); let timestamps_prev = new_timestamps_prev(); - let expected_results = new_expected_results(1); + let expected_results = if round_up_to_month { + new_expected_results_months(1) + } else { + new_expected_results_days(1) + }; for (idx, exp) in expected_results.iter().enumerate() { assert_eq!( *exp, @@ -4743,7 +4824,11 @@ mod tests { // negative interval, edge cases let timestamps_next = new_timestamps_prev(); let timestamps_prev = new_timestamps_next(); - let expected_results = new_expected_results(-1); + let expected_results = if 
round_up_to_month { + new_expected_results_months(-1) + } else { + new_expected_results_days(-1) + }; for (idx, exp) in expected_results.iter().enumerate() { assert_eq!( *exp, @@ -4751,37 +4836,31 @@ mod tests { ); } - // timestamp1 + (or -) interval = timestamp2 - // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? - let sample_size = 100000; - let timestamps1 = get_random_timestamps1(sample_size); - let intervals = get_random_intervals(sample_size); - // ts(sec) + interval(ns) = ts(sec); however, - // ts(sec) - ts(sec) cannot be = interval(ns). Therefore, - // timestamps are more precise than intervals in tests. - let mut timestamp2: ScalarValue; - for (idx, ts1) in timestamps1.iter().enumerate() { - if idx % 2 == 0 { - timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); - println!( - "{:?}, {:?}, {:?}, {:?}", - idx, timestamp2, ts1, intervals[idx] - ); - assert_eq!(intervals[idx], timestamp2.sub(ts1).unwrap()); - } else { - timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); - println!( - "{:?}, {:?}, {:?}, {:?}", - idx, timestamp2, ts1, intervals[idx] - ); - assert_eq!(intervals[idx], ts1.sub(timestamp2).unwrap()); - }; + // RANDOM-VALUED TESTS, these are not applicable for day format + if round_up_to_month { + // timestamp1 + (or -) interval = timestamp2 + // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? + let sample_size = 100000; + let timestamps1 = get_random_timestamps1(sample_size); + let intervals = get_random_intervals(sample_size); + // ts(sec) + interval(ns) = ts(sec); however, + // ts(sec) - ts(sec) cannot be = interval(ns). Therefore, + // timestamps are more precise than intervals in tests. 
+ let mut timestamp2: ScalarValue; + for (idx, ts1) in timestamps1.iter().enumerate() { + if idx % 2 == 0 { + timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); + assert_eq!(intervals[idx], timestamp2.sub(ts1).unwrap()); + } else { + timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); + assert_eq!(intervals[idx], ts1.sub(timestamp2).unwrap()); + }; + } } } fn new_timestamps_next() -> Vec { vec![ - // ScalarValue::TimestampNanosecond(Some(1308158638939668236), None), ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) @@ -4877,7 +4956,6 @@ mod tests { fn new_timestamps_prev() -> Vec { vec![ - // ScalarValue::TimestampNanosecond(Some(1171521569027710670), None), ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) @@ -4971,9 +5049,8 @@ mod tests { ] } - fn new_expected_results(sign: i32) -> Vec { + fn new_expected_results_months(sign: i32) -> Vec { vec![ - // ScalarValue::IntervalMonthDayNano(Some(4040636288743990090004520869950)), ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 0))), ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( 0, @@ -5007,6 +5084,41 @@ mod tests { ))), ] } + fn new_expected_results_days(sign: i32) -> Vec { + vec![ + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value(0, 0))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + 0, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + 0, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + 0, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + sign * 250, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, sign * 59, sign as i64 * 15_000), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), + )), + 
ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 425, + sign * 86370000, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 15735, + 0, + ))), + ] + } fn get_random_timestamps1(sample_size: u64) -> Vec { let vector_size = sample_size; From 708d7179d2d258b420de4f3c5e6c181e67f05d38 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 8 Mar 2023 14:41:00 +0300 Subject: [PATCH 04/55] random tests extended --- datafusion-cli/Cargo.lock | 89 +++++++++++++++++---------------- datafusion/common/src/scalar.rs | 77 ++++++++++++++++++++-------- 2 files changed, 101 insertions(+), 65 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 78847f39290d..02fc00c8c4cf 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -293,9 +293,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.64" +version = "0.1.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" +checksum = "b84f9ebcc6c1f5b8cb160f6990096a5c127f423fcb6e1ccc46c370cbdfb75dfc" dependencies = [ "proc-macro2", "quote", @@ -592,9 +592,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af91f40b7355f82b0a891f50e70399475945bb0b0da4f1700ce60761c9d3e359" +checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad" dependencies = [ "csv-core", "itoa", @@ -613,9 +613,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.91" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" +checksum = "9a140f260e6f3f79013b8bfc65e7ce630c9ab4388c6a89c71e07226f49487b72" dependencies = [ "cc", "cxxbridge-flags", @@ -625,9 +625,9 @@ dependencies = [ [[package]] name = "cxx-build" 
-version = "1.0.91" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" +checksum = "da6383f459341ea689374bf0a42979739dc421874f112ff26f829b8040b8e613" dependencies = [ "cc", "codespan-reporting", @@ -640,15 +640,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.91" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" +checksum = "90201c1a650e95ccff1c8c0bb5a343213bdd317c6e600a93075bca2eff54ec97" [[package]] name = "cxxbridge-macro" -version = "1.0.91" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" +checksum = "0b75aed41bb2e6367cae39e6326ef817a851db13c13e4f3263714ca3cfb8de56" dependencies = [ "proc-macro2", "quote", @@ -742,6 +742,7 @@ dependencies = [ "num_cpus", "object_store", "parquet", + "rand", "sqlparser", ] @@ -1342,9 +1343,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "io-lifetimes" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +checksum = "cfa919a82ea574332e2de6e74b4c36e74d41982b335080fa59d4ef31be20fdf3" dependencies = [ "libc", "windows-sys 0.45.0", @@ -1367,9 +1368,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" +checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "jobserver" @@ -1812,9 +1813,9 @@ dependencies = [ [[package]] name = "paste" -version = "1.0.11" +version = "1.0.12" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba" +checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" [[package]] name = "percent-encoding" @@ -1824,9 +1825,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "petgraph" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", "indexmap", @@ -2058,9 +2059,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.8" +version = "0.36.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" +checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc" dependencies = [ "bitflags", "errno", @@ -2093,9 +2094,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "rustyline" @@ -2122,9 +2123,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" +checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" [[package]] name = "same-file" @@ -2143,9 +2144,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.3" +version = "1.0.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" [[package]] name = "sct" @@ -2165,24 +2166,24 @@ checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" [[package]] name = "seq-macro" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1685deded9b272198423bdbdb907d8519def2f26cf3699040e54e8c4fbd5c5ce" +checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.152" +version = "1.0.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +checksum = "3a382c72b4ba118526e187430bb4963cd6d55051ebf13d9b25574d379cc98d20" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.152" +version = "1.0.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +checksum = "1ef476a5790f0f6decbc66726b6e5d63680ed518283e64c7df415989d880954f" dependencies = [ "proc-macro2", "quote", @@ -2191,9 +2192,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cad406b69c91885b5107daf2c29572f6c8cdb3c66826821e286c533490c0bc76" +checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" dependencies = [ "itoa", "ryu", @@ -2268,9 +2269,9 @@ checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +checksum = 
"64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" dependencies = [ "libc", "winapi", @@ -2387,18 +2388,18 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.38" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" dependencies = [ "proc-macro2", "quote", @@ -2574,9 +2575,9 @@ checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-normalization" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 2e31b24811d4..0e69f02f00f0 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -4836,26 +4836,29 @@ mod tests { ); } - // RANDOM-VALUED TESTS, these are not applicable for day format - if round_up_to_month { - // timestamp1 + (or -) interval = timestamp2 - // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? - let sample_size = 100000; - let timestamps1 = get_random_timestamps1(sample_size); - let intervals = get_random_intervals(sample_size); - // ts(sec) + interval(ns) = ts(sec); however, - // ts(sec) - ts(sec) cannot be = interval(ns). 
Therefore, - // timestamps are more precise than intervals in tests. - let mut timestamp2: ScalarValue; - for (idx, ts1) in timestamps1.iter().enumerate() { - if idx % 2 == 0 { - timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); - assert_eq!(intervals[idx], timestamp2.sub(ts1).unwrap()); - } else { - timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); - assert_eq!(intervals[idx], ts1.sub(timestamp2).unwrap()); - }; - } + // RANDOM-VALUED TESTS + + // timestamp1 + (or -) interval = timestamp2 + // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? + let sample_size = 100000; + let timestamps1 = get_random_timestamps1(sample_size); + let intervals = if round_up_to_month { + get_random_intervals_months(sample_size) + } else { + get_random_intervals_days(sample_size) + }; + // ts(sec) + interval(ns) = ts(sec); however, + // ts(sec) - ts(sec) cannot be = interval(ns). Therefore, + // timestamps are more precise than intervals in tests. + let mut timestamp2: ScalarValue; + for (idx, ts1) in timestamps1.iter().enumerate() { + if idx % 2 == 0 { + timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); + assert_eq!(intervals[idx], timestamp2.sub(ts1).unwrap()); + } else { + timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); + assert_eq!(intervals[idx], ts1.sub(timestamp2).unwrap()); + }; } } @@ -5183,7 +5186,7 @@ mod tests { timestamp } - fn get_random_intervals(sample_size: u64) -> Vec { + fn get_random_intervals_months(sample_size: u64) -> Vec { let vector_size = sample_size; let mut intervals = vec![]; let mut rng = rand::thread_rng(); @@ -5215,4 +5218,36 @@ mod tests { } intervals } + fn get_random_intervals_days(sample_size: u64) -> Vec { + let vector_size = sample_size; + let mut intervals = vec![]; + let mut rng = rand::thread_rng(); + for i in 0..vector_size { + if i % 4 == 0 { + let days = rng.gen_range(0..=1000); + intervals.push(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value(days, 0), + ))) + } else if i % 4 == 1 { 
+ let days = rng.gen_range(0..=1000); + let millis = rng.gen_range(0..=86_400_000); + intervals.push(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value(days, millis), + ))) + } else if i % 4 == 2 { + let days = rng.gen_range(0..=1000); + let millis = rng.gen_range(0..=86_400_000); + intervals.push(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value(days, millis), + ))) + } else { + let days = rng.gen_range(0..=1000); + let nanosecs = rng.gen_range(1..86_400_000_000_000); + intervals.push(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, days, nanosecs), + ))); + } + } + intervals + } } From c5bacbe2612eb5c3d91cceba30fde39b2b2d05d4 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 8 Mar 2023 14:57:38 +0300 Subject: [PATCH 05/55] corrections after review --- datafusion/common/src/scalar.rs | 135 ++++++++++++++------------------ 1 file changed, 59 insertions(+), 76 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 0e69f02f00f0..c4a3e73d8728 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -580,17 +580,11 @@ fn ts_microsec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - match (lhs_ts.checked_mul(1_000), rhs_ts.checked_mul(1_000)) { - (Some(lhs_ns), Some(rhs_ns)) => { - ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) - } - (None, _) => Err(DataFusionError::NotImplemented(format!( - "overflow while conversion of {lhs_ts:?}" - ))), - (_, None) => Err(DataFusionError::NotImplemented(format!( - "overflow while conversion of {rhs_ts:?}" - ))), - } + let err_msg = "Overflow while conversion to microseconds"; + let err = || DataFusionError::Execution(err_msg.to_string()); + let lhs_ns = lhs_ts.checked_mul(1_000).ok_or_else(err)?; + let rhs_ns = rhs_ts.checked_mul(1_000).ok_or_else(err)?; + ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) } #[inline] fn 
ts_millisec_sub_to_interval( @@ -599,17 +593,11 @@ fn ts_millisec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - match (lhs_ts.checked_mul(1_000_000), rhs_ts.checked_mul(1_000_000)) { - (Some(lhs_ns), Some(rhs_ns)) => { - ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) - } - (None, _) => Err(DataFusionError::NotImplemented(format!( - "overflow while conversion of {lhs_ts:?}" - ))), - (_, None) => Err(DataFusionError::NotImplemented(format!( - "overflow while conversion of {rhs_ts:?}" - ))), - } + let err_msg = "Overflow while conversion to microseconds"; + let err = || DataFusionError::Execution(err_msg.to_string()); + let lhs_ns = lhs_ts.checked_mul(1_000_000).ok_or_else(err)?; + let rhs_ns = rhs_ts.checked_mul(1_000_000).ok_or_else(err)?; + ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) } #[inline] fn ts_sec_sub_to_interval( @@ -618,20 +606,11 @@ fn ts_sec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - match ( - lhs_ts.checked_mul(1_000_000_000), - rhs_ts.checked_mul(1_000_000_000), - ) { - (Some(lhs_ns), Some(rhs_ns)) => { - ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) - } - (None, _) => Err(DataFusionError::NotImplemented(format!( - "overflow while conversion of {lhs_ts:?}" - ))), - (_, None) => Err(DataFusionError::NotImplemented(format!( - "overflow while conversion of {rhs_ts:?}" - ))), - } + let err_msg = "Overflow while conversion to microseconds"; + let err = || DataFusionError::Execution(err_msg.to_string()); + let lhs_ns = lhs_ts.checked_mul(1_000_000_000).ok_or_else(err)?; + let rhs_ns = rhs_ts.checked_mul(1_000_000_000).ok_or_else(err)?; + ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) } // This function will be removed once the result format is clarified. 
@@ -641,7 +620,7 @@ fn ts_nanosec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let round_up_to_month = true; + let round_up_to_month = false; if round_up_to_month { ts_nanosec_sub_to_interval_months(lhs_ts, rhs_ts, lhs_tz, rhs_tz) @@ -676,9 +655,8 @@ fn ts_nanosec_sub_to_interval_months( let (mut months, mut months_residual) = datetime_month_sub_with_rem(naive_date_time2, naive_date_time1)?; - let err = || { - DataFusionError::NotImplemented(String::from("months_residual nanosec overflow")) - }; + let err = + || DataFusionError::Execution(String::from("months_residual nanosec overflow")); // Check whether we can return an IntervalYearMonth variant without losing information let value = months_residual.num_nanoseconds().ok_or_else(err)?; if value == 0 { @@ -779,7 +757,7 @@ fn integer_to_naive_datetime( ), ) { (Some(x), Some(y)) => Ok((x, y)), - (x, y) => Err(DataFusionError::NotImplemented(format!( + (x, y) => Err(DataFusionError::Execution(format!( "timestamps {x:?} or {y:?} cannot be converted to datetimes", ))), } @@ -805,20 +783,20 @@ fn integer_w_timezone_to_naive_datetime( #[inline] fn parse_tz_to_offset(tz: &str) -> Result { let err_str = &String::from("error while parsing timezone"); - let err = || DataFusionError::NotImplemented(err_str.to_string()); + let err = || DataFusionError::Execution(err_str.to_string()); let sign = tz.chars().next().ok_or_else(err)?; let hours = tz[1..3] .parse::() - .map_err(|_e| DataFusionError::NotImplemented(err_str.to_string()))?; + .map_err(|_e| DataFusionError::Execution(err_str.to_string()))?; let minutes = tz[4..6] .parse::() - .map_err(|_e| DataFusionError::NotImplemented(err_str.to_string()))?; + .map_err(|_e| DataFusionError::Execution(err_str.to_string()))?; let timezone_offset = match sign { '-' => FixedOffset::east_opt(hours * 3600 + minutes * 60).ok_or_else(err)?, '+' => FixedOffset::west_opt(hours * 3600 + minutes * 60).ok_or_else(err)?, _ => { - return 
Err(DataFusionError::NotImplemented(err_str.to_string())); + return Err(DataFusionError::Execution(err_str.to_string())); } }; Ok(timezone_offset) @@ -871,7 +849,7 @@ fn datetime_day_sub( date_time2 .signed_duration_since(date_time1) .num_nanoseconds() - .ok_or(DataFusionError::NotImplemented(String::from( + .ok_or(DataFusionError::Execution(String::from( "datetime subtraction overflow", ))) } @@ -891,7 +869,7 @@ fn normalize_duration( let months_residual_new = match months_residual.checked_add(added_days) { Some(value) => value, None => { - return Err(DataFusionError::NotImplemented(format!( + return Err(DataFusionError::Execution(format!( "normalize duration error, cannot add {added_days:?} days to {months_residual:?}", ))) } @@ -906,14 +884,14 @@ fn days_in_month(year: i32, month: u32) -> Result { let last_day = match first_day.with_month(month + 1) { Some(day) => day, None => NaiveDate::from_ymd_opt(year + 1, 1, 1).ok_or_else(|| { - DataFusionError::NotImplemented(format!("out of range year: 1+{year}")) + DataFusionError::Execution(format!("out of range year: 1+{year}")) })?, }; if let Some(days) = last_day.pred_opt() { return Ok(days.day()); } } - Err(DataFusionError::NotImplemented(format!( + Err(DataFusionError::Execution(format!( "invalid date parameters, year: {year:?} & month: {month:?}", ))) } @@ -4805,39 +4783,44 @@ mod tests { #[test] fn timestamp_op_tests() { - let round_up_to_month = true; + let round_up_to_month = false; // positive interval, edge cases - let timestamps_next = new_timestamps_next(); - let timestamps_prev = new_timestamps_prev(); + let vec_timestamps_next = timestamps_next(); + let vec_timestamps_prev = timestamps_prev(); let expected_results = if round_up_to_month { - new_expected_results_months(1) + expected_results_months(1) } else { - new_expected_results_days(1) + expected_results_days(1) }; for (idx, exp) in expected_results.iter().enumerate() { assert_eq!( *exp, - timestamps_next[idx].sub(×tamps_prev[idx]).unwrap() + 
vec_timestamps_next[idx] + .sub(&vec_timestamps_prev[idx]) + .unwrap() ) } // negative interval, edge cases - let timestamps_next = new_timestamps_prev(); - let timestamps_prev = new_timestamps_next(); + let vec_timestamps_next = timestamps_prev(); + let vec_timestamps_prev = timestamps_next(); let expected_results = if round_up_to_month { - new_expected_results_months(-1) + expected_results_months(-1) } else { - new_expected_results_days(-1) + expected_results_days(-1) }; for (idx, exp) in expected_results.iter().enumerate() { assert_eq!( *exp, - timestamps_next[idx].sub(×tamps_prev[idx]).unwrap() + vec_timestamps_next[idx] + .sub(&vec_timestamps_prev[idx]) + .unwrap() ); } - - // RANDOM-VALUED TESTS - + } + #[test] + fn timestamp_op_random_tests() { + let round_up_to_month = false; // timestamp1 + (or -) interval = timestamp2 // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? let sample_size = 100000; @@ -4862,7 +4845,7 @@ mod tests { } } - fn new_timestamps_next() -> Vec { + fn timestamps_next() -> Vec { vec![ ScalarValue::TimestampNanosecond( Some( @@ -4957,7 +4940,7 @@ mod tests { ] } - fn new_timestamps_prev() -> Vec { + fn timestamps_prev() -> Vec { vec![ ScalarValue::TimestampNanosecond( Some( @@ -5052,7 +5035,7 @@ mod tests { ] } - fn new_expected_results_months(sign: i32) -> Vec { + fn expected_results_months(sign: i32) -> Vec { vec![ ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 0))), ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( @@ -5087,7 +5070,7 @@ mod tests { ))), ] } - fn new_expected_results_days(sign: i32) -> Vec { + fn expected_results_days(sign: i32) -> Vec { vec![ ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value(0, 0))), ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( @@ -5192,8 +5175,8 @@ mod tests { let mut rng = rand::thread_rng(); for i in 0..vector_size { if i % 3 == 2 && i % 4 == 3 { - let month = rng.gen_range(0..=100); - // there is 
an test issue for the days 28(29). + let month = rng.gen_range(0..100); + // there is a complex test issue for the days 28(29). // for example, if we have an expected interval 2 months 28(29) days, // the subtractor finds it as 3 months if the previous timestamp // is at february. @@ -5203,14 +5186,14 @@ mod tests { IntervalMonthDayNanoType::make_value(month, day, nanosec), ))); } else if i % 3 == 1 && i % 4 != 0 { - let day = rng.gen_range(0..=5000); + let day = rng.gen_range(0..5000); let millisec = rng.gen_range(0..86_400_000); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(day, millisec), ))) } else { - let year = rng.gen_range(0..=20); - let month = rng.gen_range(0..=50); + let year = rng.gen_range(0..20); + let month = rng.gen_range(0..50); intervals.push(ScalarValue::IntervalYearMonth(Some( IntervalYearMonthType::make_value(year, month), ))) @@ -5224,24 +5207,24 @@ mod tests { let mut rng = rand::thread_rng(); for i in 0..vector_size { if i % 4 == 0 { - let days = rng.gen_range(0..=1000); + let days = rng.gen_range(0..1000); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, 0), ))) } else if i % 4 == 1 { - let days = rng.gen_range(0..=1000); + let days = rng.gen_range(0..1000); let millis = rng.gen_range(0..=86_400_000); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, millis), ))) } else if i % 4 == 2 { - let days = rng.gen_range(0..=1000); + let days = rng.gen_range(0..1000); let millis = rng.gen_range(0..=86_400_000); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, millis), ))) } else { - let days = rng.gen_range(0..=1000); + let days = rng.gen_range(0..1000); let nanosecs = rng.gen_range(1..86_400_000_000_000); intervals.push(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value(0, days, nanosecs), From 011933f141905a26d2100ec12bb86f8d2ada96f0 Mon Sep 17 00:00:00 2001 From: 
berkaysynnada Date: Wed, 8 Mar 2023 19:16:29 +0300 Subject: [PATCH 06/55] operator check --- datafusion/common/src/scalar.rs | 53 ++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index c4a3e73d8728..f136785dad6b 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -508,26 +508,57 @@ macro_rules! impl_op { ( ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), - ) => Ok(ts_nanosec_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, - )?), + ) => match get_sign!($OPERATION) { + -1 => Ok(ts_nanosec_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, + )?), + _ => Err(DataFusionError::Internal(format!( + "Operator {} is not implemented for types {:?} and {:?}", + stringify!($OPERATION), + $LHS, + $RHS + ))), + }, ( ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampMicrosecond(Some(ts_rhs), tz_rhs), - ) => Ok(ts_microsec_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, - )?), + ) => match get_sign!($OPERATION) { + -1 => Ok(ts_microsec_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, + )?), + _ => Err(DataFusionError::Internal(format!( + "Operator {} is not implemented for types {:?} and {:?}", + stringify!($OPERATION), + $LHS, + $RHS + ))), + }, ( ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), - ) => Ok(ts_millisec_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, - )?), + ) => match get_sign!($OPERATION) { + -1 => Ok(ts_millisec_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, + )?), + _ => Err(DataFusionError::Internal(format!( + "Operator {} is not implemented for types {:?} and {:?}", + stringify!($OPERATION), + $LHS, + $RHS + ))), + }, ( ScalarValue::TimestampSecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampSecond(Some(ts_rhs), tz_rhs), 
- ) => Ok(ts_sec_sub_to_interval(&ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs)?), - + ) => match get_sign!($OPERATION) { + -1 => Ok(ts_sec_sub_to_interval(&ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs)?), + _ => Err(DataFusionError::Internal(format!( + "Operator {} is not implemented for types {:?} and {:?}", + stringify!($OPERATION), + $LHS, + $RHS + ))), + }, // Binary operations on arguments with different types: (ScalarValue::Date32(Some(days)), _) => { let value = date32_add(*days, $RHS, get_sign!($OPERATION))?; From e475f587dc9875643e6a398c00878714bbd105d2 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 Mar 2023 13:10:41 +0300 Subject: [PATCH 07/55] flag is removed --- datafusion/common/src/scalar.rs | 369 +++++++++----------------------- 1 file changed, 106 insertions(+), 263 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index f136785dad6b..e6f66e15e756 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -611,7 +611,7 @@ fn ts_microsec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let err_msg = "Overflow while conversion to microseconds"; + let err_msg = "Overflow while conversion from microsecond to nanoseconds"; let err = || DataFusionError::Execution(err_msg.to_string()); let lhs_ns = lhs_ts.checked_mul(1_000).ok_or_else(err)?; let rhs_ns = rhs_ts.checked_mul(1_000).ok_or_else(err)?; @@ -624,7 +624,7 @@ fn ts_millisec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let err_msg = "Overflow while conversion to microseconds"; + let err_msg = "Overflow while conversion from millisecond to nanoseconds"; let err = || DataFusionError::Execution(err_msg.to_string()); let lhs_ns = lhs_ts.checked_mul(1_000_000).ok_or_else(err)?; let rhs_ns = rhs_ts.checked_mul(1_000_000).ok_or_else(err)?; @@ -637,38 +637,28 @@ fn ts_sec_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let err_msg = "Overflow while conversion to microseconds"; + let err_msg 
= "Overflow while conversion from second to nanoseconds"; let err = || DataFusionError::Execution(err_msg.to_string()); let lhs_ns = lhs_ts.checked_mul(1_000_000_000).ok_or_else(err)?; let rhs_ns = rhs_ts.checked_mul(1_000_000_000).ok_or_else(err)?; ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) } -// This function will be removed once the result format is clarified. -fn ts_nanosec_sub_to_interval( - lhs_ts: &i64, - rhs_ts: &i64, - lhs_tz: &Option, - rhs_tz: &Option, -) -> Result { - let round_up_to_month = false; - - if round_up_to_month { - ts_nanosec_sub_to_interval_months(lhs_ts, rhs_ts, lhs_tz, rhs_tz) - } else { - ts_nanosec_sub_to_interval_days(lhs_ts, rhs_ts, lhs_tz, rhs_tz) - } -} - // Nanosecond-scale timestamps are subtracted to result in the narrowest interval variant. // Interval variants are always consist of the same signed parts to handle comparison operations more wisely. // For example, lhs < rhs => Interval(-, -, -), lhs > rhs => Interval(+, +, +) -fn ts_nanosec_sub_to_interval_months( +// In month-day-nano format, month bits are always 0, the result is shown in days as the largest scale. +fn ts_nanosec_sub_to_interval( lhs_ts: &i64, rhs_ts: &i64, lhs_tz: &Option, rhs_tz: &Option, ) -> Result { + let err = || { + DataFusionError::Execution(String::from( + "nanosec overflow in timestamp subtractÅŸon", + )) + }; // Conversion of integer and string-typed timestamps to NaiveDateTime objects // Timezone offsets are added also if applicable. let (naive_date_time2_unchecked, naive_date_time1_unchecked) = @@ -678,100 +668,45 @@ fn ts_nanosec_sub_to_interval_months( integer_to_naive_datetime(lhs_ts, rhs_ts)? }; - // Check whether we will find a negative interval or not + // Check whether we will find a negative interval or not. let (naive_date_time2, naive_date_time1, sign) = find_interval_sign(naive_date_time2_unchecked, naive_date_time1_unchecked); // Subtraction of datetimes. Details are inside the function. 
- let (mut months, mut months_residual) = + let (months, months_residual) = datetime_month_sub_with_rem(naive_date_time2, naive_date_time1)?; - - let err = - || DataFusionError::Execution(String::from("months_residual nanosec overflow")); - // Check whether we can return an IntervalYearMonth variant without losing information - let value = months_residual.num_nanoseconds().ok_or_else(err)?; - if value == 0 { + // Check whether we can return an IntervalYearMonth variant without losing information. + let months_residual_in_ns = months_residual.num_nanoseconds().ok_or_else(err)?; + if months_residual_in_ns == 0 { return Ok(ScalarValue::IntervalYearMonth(Some(sign * months))); } - // If months_residual is negative, take one month from months and - // add it to months_residual to make it positive. - // To ensure the difference is positive all the time, we take the days - // of previous datetime's month. - if months_residual.num_nanoseconds() < Some(0) { - (months, months_residual) = - normalize_duration(&months, &months_residual, naive_date_time1)?; - } - // Check whether we can return an IntervalDayTime variant without losing information - let months_residual_in_ns = months_residual.num_nanoseconds().ok_or_else(err)?; + let delta_secs = naive_date_time2.signed_duration_since(naive_date_time1); if months_residual_in_ns % 1_000_000 == 0 { - let delta_secs = naive_date_time2 - .signed_duration_since(naive_date_time1) - .num_milliseconds(); // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day + let as_millisec = delta_secs.num_milliseconds(); return Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( - sign * (delta_secs / 86_400_000) as i32, - sign * (delta_secs % 86_400_000) as i32, + sign * (as_millisec / 86_400_000) as i32, + sign * (as_millisec % 86_400_000) as i32, ), ))); - } - - // 60 * 60 * 24 * 1000 * 1000 * 1000 = 86_400_000_000_000, number of nanosecs in a day - Ok(ScalarValue::IntervalMonthDayNano(Some( - 
IntervalMonthDayNanoType::make_value( - sign * months, - sign * (months_residual_in_ns / 86_400_000_000_000) as i32, - sign as i64 * (months_residual_in_ns % 86_400_000_000_000), - ), - ))) -} - -// Nanosecond-scale timestamps are subtracted to result in the narrowest interval variant. -// Interval variants are always consist of the same signed parts to handle comparison operations more wisely. -// For example, lhs < rhs => Interval(-, -, -), lhs > rhs => Interval(+, +, +) -fn ts_nanosec_sub_to_interval_days( - lhs_ts: &i64, - rhs_ts: &i64, - lhs_tz: &Option, - rhs_tz: &Option, -) -> Result { - // Conversion of integer and string-typed timestamps to NaiveDateTime objects - // Timezone offsets are added also if applicable. - let (naive_date_time2_unchecked, naive_date_time1_unchecked) = - if let (Some(l), Some(r)) = (lhs_tz, rhs_tz) { - integer_w_timezone_to_naive_datetime(lhs_ts, rhs_ts, l, r)? - } else { - integer_to_naive_datetime(lhs_ts, rhs_ts)? - }; - - // Check whether we will find a negative interval or not - let (naive_date_time2, naive_date_time1, sign) = - find_interval_sign(naive_date_time2_unchecked, naive_date_time1_unchecked); - - // Subtraction of datetimes. Details are inside the function. - let duration_in_nanosec = datetime_day_sub(naive_date_time2, naive_date_time1)?; - - // Try to return in IntervalDayTime - if duration_in_nanosec % 1_000_000 == 0 { - return Ok(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value( - sign * (duration_in_nanosec / 86_400_000_000_000) as i32, - sign * ((duration_in_nanosec / 1_000_000) % 86_400_000) as i32, + } else { + // 60 * 60 * 24 * 1000 * 1000 * 1000 = 86_400_000_000_000, number of nanosecs in a day + // To show similar behaviour with Postgre, we do not use month field, and collect + // months in the day field. 
+ let as_nanosec = delta_secs.num_nanoseconds().ok_or_else(err)?; + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value( + 0, + sign * (as_nanosec / 86_400_000_000_000) as i32, + sign as i64 * (as_nanosec % 86_400_000_000_000), ), - ))); + ))) } - - // The last option IntervalMonthDayNano - Ok(ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value( - 0, - sign * (duration_in_nanosec / 86_400_000_000_000) as i32, - sign as i64 * (duration_in_nanosec % 86_400_000_000_000), - ), - ))) } + #[inline] fn integer_to_naive_datetime( lhs_ts_ns: &i64, @@ -850,8 +785,7 @@ fn datetime_month_sub_with_rem( date_time2: NaiveDateTime, date_time1: NaiveDateTime, ) -> Result<(i32, Duration), DataFusionError> { - // The difference of total months. Since this operation ignores the days of dates, - // that month count may be decreased by 1 in case of negative day count. + // The difference of total months. let months = (date_time2.year() - date_time1.year()) * 12 + (date_time2.month() as i32 - date_time1.month() as i32); @@ -869,63 +803,6 @@ fn datetime_month_sub_with_rem( Ok((months, months_residual)) } -#[inline] -// This function assumes 'date_time2' is greater than 'date_time1', -// therefore; the result cannot be negative. -fn datetime_day_sub( - date_time2: NaiveDateTime, - date_time1: NaiveDateTime, -) -> Result { - // We directly take the difference of datetimes in nanosecond precision. - date_time2 - .signed_duration_since(date_time1) - .num_nanoseconds() - .ok_or(DataFusionError::Execution(String::from( - "datetime subtraction overflow", - ))) -} -#[inline] -fn normalize_duration( - months: &i32, - months_residual: &Duration, - at_month: NaiveDateTime, -) -> Result<(i32, Duration), DataFusionError> { - // For example, if the previous datetime's month and date is (Feb, 15), - // when we add the days of that month to month_residual - // variable, we need to add the february's day count. 
- // To ensure the difference is positive all the time, we take the days - // of previous datetime's month. - let added_days = - &Duration::days(days_in_month(at_month.year(), at_month.month())?.into()); - let months_residual_new = match months_residual.checked_add(added_days) { - Some(value) => value, - None => { - return Err(DataFusionError::Execution(format!( - "normalize duration error, cannot add {added_days:?} days to {months_residual:?}", - ))) - } - }; - let months_new = months - 1; - Ok((months_new, months_residual_new)) -} -#[inline] -// It gives the day count of the corresponding month at that year. -fn days_in_month(year: i32, month: u32) -> Result { - if let Some(first_day) = NaiveDate::from_ymd_opt(year, month, 1) { - let last_day = match first_day.with_month(month + 1) { - Some(day) => day, - None => NaiveDate::from_ymd_opt(year + 1, 1, 1).ok_or_else(|| { - DataFusionError::Execution(format!("out of range year: 1+{year}")) - })?, - }; - if let Some(days) = last_day.pred_opt() { - return Ok(days.day()); - } - } - Err(DataFusionError::Execution(format!( - "invalid date parameters, year: {year:?} & month: {month:?}", - ))) -} #[inline] pub fn date32_add(days: i32, scalar: &ScalarValue, sign: i32) -> Result { @@ -4814,15 +4691,11 @@ mod tests { #[test] fn timestamp_op_tests() { - let round_up_to_month = false; // positive interval, edge cases let vec_timestamps_next = timestamps_next(); let vec_timestamps_prev = timestamps_prev(); - let expected_results = if round_up_to_month { - expected_results_months(1) - } else { - expected_results_days(1) - }; + let expected_results = get_expected_results(1); + for (idx, exp) in expected_results.iter().enumerate() { assert_eq!( *exp, @@ -4835,11 +4708,7 @@ mod tests { // negative interval, edge cases let vec_timestamps_next = timestamps_prev(); let vec_timestamps_prev = timestamps_next(); - let expected_results = if round_up_to_month { - expected_results_months(-1) - } else { - expected_results_days(-1) - }; + let 
expected_results = get_expected_results(-1); for (idx, exp) in expected_results.iter().enumerate() { assert_eq!( *exp, @@ -4851,16 +4720,11 @@ mod tests { } #[test] fn timestamp_op_random_tests() { - let round_up_to_month = false; // timestamp1 + (or -) interval = timestamp2 // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? - let sample_size = 100000; - let timestamps1 = get_random_timestamps1(sample_size); - let intervals = if round_up_to_month { - get_random_intervals_months(sample_size) - } else { - get_random_intervals_days(sample_size) - }; + let sample_size = 10000000; + let timestamps1 = get_random_timestamps(sample_size); + let intervals = get_random_intervals(sample_size); // ts(sec) + interval(ns) = ts(sec); however, // ts(sec) - ts(sec) cannot be = interval(ns). Therefore, // timestamps are more precise than intervals in tests. @@ -4868,10 +4732,22 @@ mod tests { for (idx, ts1) in timestamps1.iter().enumerate() { if idx % 2 == 0 { timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); - assert_eq!(intervals[idx], timestamp2.sub(ts1).unwrap()); + assert_eq!( + intervals[idx], + timestamp2.sub(ts1).unwrap(), + "operands: {:?} (-) {:?}", + ts1.add(intervals[idx].clone()).unwrap(), + ts1 + ); } else { timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); - assert_eq!(intervals[idx], ts1.sub(timestamp2).unwrap()); + assert_eq!( + intervals[idx], + ts1.sub(timestamp2).unwrap(), + "operands: {:?} (-) {:?}", + ts1, + ts1.sub(intervals[idx].clone()).unwrap() + ); }; } } @@ -5066,7 +4942,7 @@ mod tests { ] } - fn expected_results_months(sign: i32) -> Vec { + fn get_expected_results(sign: i32) -> Vec { vec![ ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 0))), ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( @@ -5086,10 +4962,10 @@ mod tests { sign * 250, ))), ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(sign * 2, 0, sign as i64 * 15_000), + 
IntervalMonthDayNanoType::make_value(0, sign * 59, sign as i64 * 15_000), )), ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(sign, sign, sign as i64 * 22), + IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), )), ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( sign * 425, @@ -5101,43 +4977,8 @@ mod tests { ))), ] } - fn expected_results_days(sign: i32) -> Vec { - vec![ - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value(0, 0))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - 0, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - 0, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - 0, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - sign * 250, - ))), - ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, sign * 59, sign as i64 * 15_000), - )), - ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), - )), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 425, - sign * 86370000, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 15735, - 0, - ))), - ] - } - fn get_random_timestamps1(sample_size: u64) -> Vec { + fn get_random_timestamps(sample_size: u64) -> Vec { let vector_size = sample_size; let mut timestamp = vec![]; let mut rng = rand::thread_rng(); @@ -5160,7 +5001,11 @@ mod tests { None, )) } else if i % 4 == 1 { - let millisec = rng.gen_range(0..=999); + let rand = rng.gen_range(1..=999); + let millisec = if rand % 2 == 1 { rand } else { rand - 1 }; + // timestamps millisecs are always created with odd millisecs to prevent. 
+ // such situations: timestamp(millisec) - interval(millisec) = timestamp(millisec) + // However, timestamp(millisec) - timestamp(millisec) = interval(month) timestamp.push(ScalarValue::TimestampMillisecond( Some( NaiveDate::from_ymd_opt(year, month, day) @@ -5172,7 +5017,7 @@ mod tests { None, )) } else if i % 4 == 2 { - let microsec = rng.gen_range(0..=999_999); + let microsec = rng.gen_range(1..=999_999); timestamp.push(ScalarValue::TimestampMicrosecond( Some( NaiveDate::from_ymd_opt(year, month, day) @@ -5184,7 +5029,8 @@ mod tests { None, )) } else if i % 4 == 3 { - let nanosec = rng.gen_range(0..=999_999_999); + let rand = rng.gen_range(1..=999_999_999); + let nanosec = if rand % 2 == 1 { rand } else { rand - 1 }; timestamp.push(ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(year, month, day) @@ -5200,65 +5046,62 @@ mod tests { timestamp } - fn get_random_intervals_months(sample_size: u64) -> Vec { - let vector_size = sample_size; - let mut intervals = vec![]; - let mut rng = rand::thread_rng(); - for i in 0..vector_size { - if i % 3 == 2 && i % 4 == 3 { - let month = rng.gen_range(0..100); - // there is a complex test issue for the days 28(29). - // for example, if we have an expected interval 2 months 28(29) days, - // the subtractor finds it as 3 months if the previous timestamp - // is at february. 
- let day = rng.gen_range(0..=27); - let nanosec = rng.gen_range(0..86_400_000_000_000); - intervals.push(ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(month, day, nanosec), - ))); - } else if i % 3 == 1 && i % 4 != 0 { - let day = rng.gen_range(0..5000); - let millisec = rng.gen_range(0..86_400_000); - intervals.push(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value(day, millisec), - ))) - } else { - let year = rng.gen_range(0..20); - let month = rng.gen_range(0..50); - intervals.push(ScalarValue::IntervalYearMonth(Some( - IntervalYearMonthType::make_value(year, month), - ))) - } - } - intervals - } - fn get_random_intervals_days(sample_size: u64) -> Vec { + fn get_random_intervals(sample_size: u64) -> Vec { let vector_size = sample_size; let mut intervals = vec![]; let mut rng = rand::thread_rng(); for i in 0..vector_size { if i % 4 == 0 { - let days = rng.gen_range(0..1000); - intervals.push(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value(days, 0), - ))) + let days = rng.gen_range(1..1000); + // To have variatons like timestamp(sec) + IntervalYearMonth and + // timestamp(sec) + IntervalDayTimeType(without millisec, since timestamps(sec) + + // interval(millisec) => timestamp(sec), we cannot forecast the resulting type). + // such conditions are added. + if i % 8 == 0 + || (days % 28 != 0) + || (days % 29 != 0) + || (days % 30 != 0) + || (days % 31 != 0) + { + intervals.push(ScalarValue::IntervalYearMonth(Some( + IntervalYearMonthType::make_value( + rng.gen_range(0..10), + rng.gen_range(0..500), + ), + ))) + } else { + intervals.push(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value(days, 0), + ))) + } } else if i % 4 == 1 { - let days = rng.gen_range(0..1000); - let millis = rng.gen_range(0..=86_400_000); + // interval millisecs are always created with even millisecs. 
+ let days = rng.gen_range(1..1000); + let rand = rng.gen_range(0..86_400_000); + let millisec = if rand % 2 == 0 { rand } else { rand - 1 }; intervals.push(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value(days, millis), + IntervalDayTimeType::make_value(days, millisec), ))) } else if i % 4 == 2 { - let days = rng.gen_range(0..1000); - let millis = rng.gen_range(0..=86_400_000); + let days = rng.gen_range(1..1000); + let millisec = rng.gen_range(0..86_400_000); intervals.push(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value(days, millis), + IntervalDayTimeType::make_value(days, millisec), ))) } else { let days = rng.gen_range(0..1000); - let nanosecs = rng.gen_range(1..86_400_000_000_000); + let rand = rng.gen_range(1..86_400_000_000_000); + let nanosec = if rand % 2 == 0 { rand } else { rand - 1 }; intervals.push(ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, days, nanosecs), + IntervalMonthDayNanoType::make_value( + 0, + days, + if nanosec % 1_000_000 == 0 { + nanosec - 1 + } else { + nanosec + }, + ), ))); } } From 423fb65714f4d3c407fa7870743a201578de8340 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 Mar 2023 14:38:47 +0300 Subject: [PATCH 08/55] clippy fix --- datafusion/common/src/scalar.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index e6f66e15e756..3a078cee3184 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -686,12 +686,12 @@ fn ts_nanosec_sub_to_interval( if months_residual_in_ns % 1_000_000 == 0 { // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day let as_millisec = delta_secs.num_milliseconds(); - return Ok(ScalarValue::IntervalDayTime(Some( + Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( sign * (as_millisec / 86_400_000) as i32, sign * (as_millisec % 86_400_000) as i32, ), - ))); + ))) } else { // 
60 * 60 * 24 * 1000 * 1000 * 1000 = 86_400_000_000_000, number of nanosecs in a day // To show similar behaviour with Postgre, we do not use month field, and collect From 1291758c0ac6db07d0fca4bfc0536229c7f4f5d4 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 Mar 2023 14:41:00 +0300 Subject: [PATCH 09/55] toml conflict --- datafusion/common/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index fa0d0c71a60c..6a546f3fd70e 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -47,5 +47,5 @@ num_cpus = "1.13.0" object_store = { version = "0.5.4", default-features = false, optional = true } parquet = { version = "34.0.0", default-features = false, optional = true } pyo3 = { version = "0.18.0", optional = true } +sqlparser = "0.32" rand = "0.8.4" -sqlparser = "0.30" From d7f3696ee436ce154a76cecd7c28d4fa06e04f2d Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 Mar 2023 16:43:01 +0300 Subject: [PATCH 10/55] minor changes --- datafusion/common/Cargo.toml | 2 +- datafusion/common/src/scalar.rs | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 554de6fa2efa..aa38ebd0e46c 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -47,6 +47,6 @@ num_cpus = "1.13.0" object_store = { version = "0.5.4", default-features = false, optional = true } parquet = { version = "34.0.0", default-features = false, optional = true } pyo3 = { version = "0.18.0", optional = true } -sqlparser = "0.32" rand = "0.8.4" +sqlparser = "0.32" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index e848b79f66a2..acbaa1ec2b2d 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -672,7 +672,7 @@ fn ts_nanosec_sub_to_interval( ) -> Result { let err = || { DataFusionError::Execution(String::from( - 
"nanosec overflow in timestamp subtractÅŸon", + "nanosec overflow in timestamp subtraction", )) }; // Conversion of integer and string-typed timestamps to NaiveDateTime objects @@ -4784,25 +4784,24 @@ mod tests { // ts(sec) + interval(ns) = ts(sec); however, // ts(sec) - ts(sec) cannot be = interval(ns). Therefore, // timestamps are more precise than intervals in tests. - let mut timestamp2: ScalarValue; for (idx, ts1) in timestamps1.iter().enumerate() { if idx % 2 == 0 { - timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); + let timestamp2 = ts1.add(intervals[idx].clone()).unwrap(); assert_eq!( intervals[idx], timestamp2.sub(ts1).unwrap(), "operands: {:?} (-) {:?}", - ts1.add(intervals[idx].clone()).unwrap(), + timestamp2, ts1 ); } else { - timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); + let timestamp2 = ts1.sub(intervals[idx].clone()).unwrap(); assert_eq!( intervals[idx], - ts1.sub(timestamp2).unwrap(), + ts1.sub(timestamp2.clone()).unwrap(), "operands: {:?} (-) {:?}", ts1, - ts1.sub(intervals[idx].clone()).unwrap() + timestamp2 ); }; } From 8d5c8e3eed1f89cb9e0718592e04285531803c3d Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Sat, 11 Mar 2023 19:47:51 +0300 Subject: [PATCH 11/55] deterministic matches --- datafusion/common/src/scalar.rs | 393 ++++++++++++++------------------ 1 file changed, 171 insertions(+), 222 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index acbaa1ec2b2d..10107306a614 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -43,9 +43,7 @@ use arrow::{ DECIMAL128_MAX_PRECISION, }, }; -use chrono::{ - DateTime, Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime, Timelike, -}; +use chrono::{DateTime, Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime}; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part to arrow's [`Array`]. @@ -506,54 +504,78 @@ macro_rules! 
impl_op { primitive_op!(lhs, rhs, Int8, $OPERATION) } ( - ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), + ScalarValue::TimestampSecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampSecond(Some(ts_rhs), tz_rhs), ) => match get_sign!($OPERATION) { - -1 => Ok(ts_nanosec_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, - )?), + -1 => { + let err = || { + DataFusionError::Execution( + "Overflow while conversion from second to millisecond" + .to_string(), + ) + }; + Ok(ts_sub_to_interval( + &ts_lhs.checked_mul(1_000).ok_or_else(err)?, + &ts_rhs.checked_mul(1_000).ok_or_else(err)?, + &tz_lhs, + &tz_rhs, + 1, + )?) + } _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for types {:?} and {:?}", + "Operator {} is not implemented for {:?} and {:?}", stringify!($OPERATION), $LHS, $RHS ))), }, ( - ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampMicrosecond(Some(ts_rhs), tz_rhs), + ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), ) => match get_sign!($OPERATION) { - -1 => Ok(ts_microsec_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, - )?), + -1 => Ok(ts_sub_to_interval(&ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, 1)?), _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for types {:?} and {:?}", + "Operator {} is not implemented for {:?} and {:?}", stringify!($OPERATION), $LHS, $RHS ))), }, ( - ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), + ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampMicrosecond(Some(ts_rhs), tz_rhs), ) => match get_sign!($OPERATION) { - -1 => Ok(ts_millisec_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, - )?), + -1 => { + let err = || { + DataFusionError::Execution( + "Overflow while conversion from microsecond to 
nanosecond" + .to_string(), + ) + }; + Ok(ts_sub_to_interval( + &ts_lhs.checked_mul(1_000).ok_or_else(err)?, + &ts_rhs.checked_mul(1_000).ok_or_else(err)?, + &tz_lhs, + &tz_rhs, + 1_000_000, + )?) + } _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for types {:?} and {:?}", + "Operator {} is not implemented for {:?} and {:?}", stringify!($OPERATION), $LHS, $RHS ))), }, ( - ScalarValue::TimestampSecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampSecond(Some(ts_rhs), tz_rhs), + ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), ) => match get_sign!($OPERATION) { - -1 => Ok(ts_sec_sub_to_interval(&ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs)?), + -1 => Ok(ts_sub_to_interval( + &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, 1_000_000, + )?), _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for types {:?} and {:?}", + "Operator {} is not implemented for {:?} and {:?}", stringify!($OPERATION), $LHS, $RHS @@ -619,146 +641,126 @@ macro_rules! 
get_sign { }; } -// all timestamp variants are converted to nanosecond scale -#[inline] -fn ts_microsec_sub_to_interval( - lhs_ts: &i64, - rhs_ts: &i64, - lhs_tz: &Option, - rhs_tz: &Option, -) -> Result { - let err_msg = "Overflow while conversion from microsecond to nanoseconds"; - let err = || DataFusionError::Execution(err_msg.to_string()); - let lhs_ns = lhs_ts.checked_mul(1_000).ok_or_else(err)?; - let rhs_ns = rhs_ts.checked_mul(1_000).ok_or_else(err)?; - ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) -} -#[inline] -fn ts_millisec_sub_to_interval( +// Timestamp(sec) and Timestamp(millisec) difference is resulting as Interval(days, millis) +// Timestamp(microsec) and Tiemstamp(nanosec) difference is resulting as Interval(days, nanos) +fn ts_sub_to_interval( lhs_ts: &i64, rhs_ts: &i64, lhs_tz: &Option, rhs_tz: &Option, + scale_factor: i32, ) -> Result { - let err_msg = "Overflow while conversion from millisecond to nanoseconds"; - let err = || DataFusionError::Execution(err_msg.to_string()); - let lhs_ns = lhs_ts.checked_mul(1_000_000).ok_or_else(err)?; - let rhs_ns = rhs_ts.checked_mul(1_000_000).ok_or_else(err)?; - ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) -} -#[inline] -fn ts_sec_sub_to_interval( - lhs_ts: &i64, - rhs_ts: &i64, - lhs_tz: &Option, - rhs_tz: &Option, -) -> Result { - let err_msg = "Overflow while conversion from second to nanoseconds"; - let err = || DataFusionError::Execution(err_msg.to_string()); - let lhs_ns = lhs_ts.checked_mul(1_000_000_000).ok_or_else(err)?; - let rhs_ns = rhs_ts.checked_mul(1_000_000_000).ok_or_else(err)?; - ts_nanosec_sub_to_interval(&lhs_ns, &rhs_ns, lhs_tz, rhs_tz) -} - -// Nanosecond-scale timestamps are subtracted to result in the narrowest interval variant. -// Interval variants are always consist of the same signed parts to handle comparison operations more wisely. 
-// For example, lhs < rhs => Interval(-, -, -), lhs > rhs => Interval(+, +, +) -// In month-day-nano format, month bits are always 0, the result is shown in days as the largest scale. -fn ts_nanosec_sub_to_interval( - lhs_ts: &i64, - rhs_ts: &i64, - lhs_tz: &Option, - rhs_tz: &Option, -) -> Result { - let err = || { - DataFusionError::Execution(String::from( - "nanosec overflow in timestamp subtraction", - )) - }; // Conversion of integer and string-typed timestamps to NaiveDateTime objects // Timezone offsets are added also if applicable. let (naive_date_time2_unchecked, naive_date_time1_unchecked) = - if let (Some(l), Some(r)) = (lhs_tz, rhs_tz) { - integer_w_timezone_to_naive_datetime(lhs_ts, rhs_ts, l, r)? - } else { - integer_to_naive_datetime(lhs_ts, rhs_ts)? - }; + with_timezone_to_naive_datetime(lhs_ts, rhs_ts, lhs_tz, rhs_tz, &scale_factor)?; // Check whether we will find a negative interval or not. let (naive_date_time2, naive_date_time1, sign) = find_interval_sign(naive_date_time2_unchecked, naive_date_time1_unchecked); - // Subtraction of datetimes. Details are inside the function. - let (months, months_residual) = - datetime_month_sub_with_rem(naive_date_time2, naive_date_time1)?; - // Check whether we can return an IntervalYearMonth variant without losing information. 
- let months_residual_in_ns = months_residual.num_nanoseconds().ok_or_else(err)?; - if months_residual_in_ns == 0 { - return Ok(ScalarValue::IntervalYearMonth(Some(sign * months))); - } - - // Check whether we can return an IntervalDayTime variant without losing information let delta_secs = naive_date_time2.signed_duration_since(naive_date_time1); - if months_residual_in_ns % 1_000_000 == 0 { + + match scale_factor { // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day - let as_millisec = delta_secs.num_milliseconds(); - Ok(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value( - sign * (as_millisec / 86_400_000) as i32, - sign * (as_millisec % 86_400_000) as i32, - ), - ))) - } else { - // 60 * 60 * 24 * 1000 * 1000 * 1000 = 86_400_000_000_000, number of nanosecs in a day - // To show similar behaviour with Postgre, we do not use month field, and collect - // months in the day field. - let as_nanosec = delta_secs.num_nanoseconds().ok_or_else(err)?; - Ok(ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value( - 0, - sign * (as_nanosec / 86_400_000_000_000) as i32, - sign as i64 * (as_nanosec % 86_400_000_000_000), - ), - ))) + 1 => { + let as_millisecs = delta_secs.num_milliseconds(); + Ok(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value( + sign * (as_millisecs / 86_400_000) as i32, + sign * (as_millisecs % 86_400_000) as i32, + ), + ))) + } + // 60 * 60 * 24 * 1000_000_000 = 86_400_000_000_000, number of nanosecs in a day + 1_000_000 => { + let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { + DataFusionError::Execution(String::from( + "timestamp difference cannot be shown in nanosecond precision", + )) + })?; + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value( + 0, + sign * (as_nanosecs / 86_400_000_000_000) as i32, + sign as i64 * (as_nanosecs % 86_400_000_000_000), + ), + ))) + } + _ => Err(DataFusionError::Execution(String::from( + "undefined 
scale factor", + ))), } } +#[inline] +fn with_timezone_to_naive_datetime( + lhs_ts: &i64, + rhs_ts: &i64, + lhs_tz: &Option, + rhs_tz: &Option, + scale_factor: &i32, +) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { + let (naive_lhs, naive_rhs) = match scale_factor { + 1 => ms_to_naive_datetime(lhs_ts, rhs_ts)?, + 1_000_000 => ns_to_naive_datetime(lhs_ts, rhs_ts)?, + _ => { + return Err(DataFusionError::Execution(String::from( + "undefined scale factor", + ))) + } + }; + match (lhs_tz, rhs_tz) { + (Some(l), Some(r)) => match (parse_tz_to_offset(l), parse_tz_to_offset(r)) { + (Ok(l), Ok(r)) => Ok(( + DateTime::::from_utc(naive_lhs, l).naive_local(), + DateTime::::from_utc(naive_rhs, r).naive_local(), + )), + (_, _) => Ok((naive_lhs, naive_rhs)), + }, + (_, _) => Ok((naive_lhs, naive_rhs)), + } +} #[inline] -fn integer_to_naive_datetime( - lhs_ts_ns: &i64, - rhs_ts_ns: &i64, +fn ms_to_naive_datetime( + lhs_ts_ms: &i64, + rhs_ts_ms: &i64, ) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { match ( NaiveDateTime::from_timestamp_opt( - lhs_ts_ns / 1_000_000_000, - (lhs_ts_ns % 1_000_000_000) as u32, + lhs_ts_ms / 1_000, + (lhs_ts_ms % 1_000) as u32 * 1_000_000, ), NaiveDateTime::from_timestamp_opt( - rhs_ts_ns / 1_000_000_000, - (rhs_ts_ns % 1_000_000_000) as u32, + rhs_ts_ms / 1_000, + (rhs_ts_ms % 1_000) as u32 * 1_000_000, ), ) { (Some(x), Some(y)) => Ok((x, y)), (x, y) => Err(DataFusionError::Execution(format!( - "timestamps {x:?} or {y:?} cannot be converted to datetimes", + "timestamps {x:?} or {y:?} cannot be converted to NaiveDateTime", ))), } } #[inline] -fn integer_w_timezone_to_naive_datetime( +fn ns_to_naive_datetime( lhs_ts_ns: &i64, rhs_ts_ns: &i64, - lhs_tz: &str, - rhs_tz: &str, ) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { - let (naive_lhs, naive_rhs) = integer_to_naive_datetime(lhs_ts_ns, rhs_ts_ns)?; - - match (parse_tz_to_offset(lhs_tz), parse_tz_to_offset(rhs_tz)) { - (Ok(l), Ok(r)) => Ok(( - 
DateTime::::from_utc(naive_lhs, l).naive_local(), - DateTime::::from_utc(naive_rhs, r).naive_local(), - )), - (_, _) => Ok((naive_lhs, naive_rhs)), + match ( + NaiveDateTime::from_timestamp_opt( + lhs_ts_ns / 1_000_000_000, + (lhs_ts_ns % 1_000_000_000) as u32, + ), + NaiveDateTime::from_timestamp_opt( + rhs_ts_ns / 1_000_000_000, + (rhs_ts_ns % 1_000_000_000) as u32, + ), + ) { + (Some(x), Some(y)) => Ok((x, y)), + (x, y) => Err(DataFusionError::Execution(format!( + "timestamps {x:?} or {y:?} cannot be converted to NaiveDateTime", + ))), } } // This function parses as the format of "+HH:MM", for example, "+05:30" @@ -794,31 +796,6 @@ fn find_interval_sign( (ndt2, ndt1, 1) } } -// This function assumes 'date_time2' is greater than 'date_time1', -// therefore; resulting 'months' cannot be negative. -#[inline] -fn datetime_month_sub_with_rem( - date_time2: NaiveDateTime, - date_time1: NaiveDateTime, -) -> Result<(i32, Duration), DataFusionError> { - // The difference of total months. - let months = (date_time2.year() - date_time1.year()) * 12 - + (date_time2.month() as i32 - date_time1.month() as i32); - - // months_residual is in the form of X secs, Y nanosecs. - // Y cannot be larger than 1_000_000_000, it is rounded up to seconds. - // The subtractions may overflow, so cast i64. 
- let months_residual = - Duration::days(date_time2.day() as i64 - date_time1.day() as i64) - + Duration::hours(date_time2.hour() as i64 - date_time1.hour() as i64) - + Duration::minutes(date_time2.minute() as i64 - date_time1.minute() as i64) - + Duration::seconds(date_time2.second() as i64 - date_time1.second() as i64) - + Duration::nanoseconds( - date_time2.nanosecond() as i64 - date_time1.nanosecond() as i64, - ); - - Ok((months, months_residual)) -} #[inline] pub fn date32_add(days: i32, scalar: &ScalarValue, sign: i32) -> Result { @@ -4778,7 +4755,7 @@ mod tests { fn timestamp_op_random_tests() { // timestamp1 + (or -) interval = timestamp2 // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? - let sample_size = 10000000; + let sample_size = 1000000; let timestamps1 = get_random_timestamps(sample_size); let intervals = get_random_intervals(sample_size); // ts(sec) + interval(ns) = ts(sec); however, @@ -4790,7 +4767,8 @@ mod tests { assert_eq!( intervals[idx], timestamp2.sub(ts1).unwrap(), - "operands: {:?} (-) {:?}", + "index:{}, operands: {:?} (-) {:?}", + idx, timestamp2, ts1 ); @@ -4799,7 +4777,8 @@ mod tests { assert_eq!( intervals[idx], ts1.sub(timestamp2.clone()).unwrap(), - "operands: {:?} (-) {:?}", + "index:{}, operands: {:?} (-) {:?}", + idx, ts1, timestamp2 ); @@ -4831,7 +4810,7 @@ mod tests { ), ScalarValue::TimestampMillisecond( Some( - NaiveDate::from_ymd_opt(2023, 3, 1) + NaiveDate::from_ymd_opt(2023, 2, 11) .unwrap() .and_hms_milli_opt(10, 10, 0, 000) .unwrap() @@ -4999,18 +4978,19 @@ mod tests { fn get_expected_results(sign: i32) -> Vec { vec![ - ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 0))), - ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( - 0, - sign * 2, - ))), - ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, 0, 0), + )), + ScalarValue::IntervalMonthDayNano(Some( 
+ IntervalMonthDayNanoType::make_value(0, sign * 59, 0), + )), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 41, 0, - sign * 2, ))), - ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, 0, - sign * 2, ))), ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( sign * 59, @@ -5026,9 +5006,9 @@ mod tests { sign * 425, sign * 86370000, ))), - ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( - sign * 43, - sign, + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 15735, + 0, ))), ] } @@ -5040,7 +5020,7 @@ mod tests { for i in 0..vector_size { let year = rng.gen_range(1995..=2050); let month = rng.gen_range(1..=12); - let day = rng.gen_range(1..=28); + let day = rng.gen_range(1..=28); // to exclude invalid dates let hour = rng.gen_range(0..=23); let minute = rng.gen_range(0..=59); let second = rng.gen_range(0..=59); @@ -5056,11 +5036,7 @@ mod tests { None, )) } else if i % 4 == 1 { - let rand = rng.gen_range(1..=999); - let millisec = if rand % 2 == 1 { rand } else { rand - 1 }; - // timestamps millisecs are always created with odd millisecs to prevent. 
- // such situations: timestamp(millisec) - interval(millisec) = timestamp(millisec) - // However, timestamp(millisec) - timestamp(millisec) = interval(month) + let millisec = rng.gen_range(0..=999); timestamp.push(ScalarValue::TimestampMillisecond( Some( NaiveDate::from_ymd_opt(year, month, day) @@ -5072,7 +5048,7 @@ mod tests { None, )) } else if i % 4 == 2 { - let microsec = rng.gen_range(1..=999_999); + let microsec = rng.gen_range(0..=999_999); timestamp.push(ScalarValue::TimestampMicrosecond( Some( NaiveDate::from_ymd_opt(year, month, day) @@ -5084,8 +5060,7 @@ mod tests { None, )) } else if i % 4 == 3 { - let rand = rng.gen_range(1..=999_999_999); - let nanosec = if rand % 2 == 1 { rand } else { rand - 1 }; + let nanosec = rng.gen_range(0..=999_999_999); timestamp.push(ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(year, month, day) @@ -5107,56 +5082,30 @@ mod tests { let mut rng = rand::thread_rng(); for i in 0..vector_size { if i % 4 == 0 { - let days = rng.gen_range(1..1000); - // To have variatons like timestamp(sec) + IntervalYearMonth and - // timestamp(sec) + IntervalDayTimeType(without millisec, since timestamps(sec) + - // interval(millisec) => timestamp(sec), we cannot forecast the resulting type). - // such conditions are added. - if i % 8 == 0 - || (days % 28 != 0) - || (days % 29 != 0) - || (days % 30 != 0) - || (days % 31 != 0) - { - intervals.push(ScalarValue::IntervalYearMonth(Some( - IntervalYearMonthType::make_value( - rng.gen_range(0..10), - rng.gen_range(0..500), - ), - ))) - } else { - intervals.push(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value(days, 0), - ))) - } - } else if i % 4 == 1 { - // interval millisecs are always created with even millisecs. 
- let days = rng.gen_range(1..1000); - let rand = rng.gen_range(0..86_400_000); - let millisec = if rand % 2 == 0 { rand } else { rand - 1 }; + let days = rng.gen_range(0..5000); + // to not break second precision + let millis = rng.gen_range(0..86_400) * 1000; intervals.push(ScalarValue::IntervalDayTime(Some( - IntervalDayTimeType::make_value(days, millisec), + IntervalDayTimeType::make_value(days, millis), ))) - } else if i % 4 == 2 { - let days = rng.gen_range(1..1000); + } else if i % 4 == 1 { + let days = rng.gen_range(0..5000); let millisec = rng.gen_range(0..86_400_000); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, millisec), ))) + } else if i % 4 == 2 { + let days = rng.gen_range(0..5000); + // to not break microsec precision + let nanosec = rng.gen_range(0..86_400_000_000) * 1000; + intervals.push(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, days, nanosec), + ))) } else { - let days = rng.gen_range(0..1000); - let rand = rng.gen_range(1..86_400_000_000_000); - let nanosec = if rand % 2 == 0 { rand } else { rand - 1 }; + let days = rng.gen_range(0..5000); + let nanosec = rng.gen_range(0..86_400_000_000_000); intervals.push(ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value( - 0, - days, - if nanosec % 1_000_000 == 0 { - nanosec - 1 - } else { - nanosec - }, - ), + IntervalMonthDayNanoType::make_value(0, days, nanosec), ))); } } From 31577d95a59601cd4c4c19df1aa158fa32d5667e Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Sun, 12 Mar 2023 17:57:27 +0300 Subject: [PATCH 12/55] simplifications (clippy error) --- datafusion-cli/Cargo.lock | 68 ++++++++++++++++----------------- datafusion/common/src/scalar.rs | 34 +++++------------ 2 files changed, 44 insertions(+), 58 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 68ff002f7ad6..a5e40209fe1d 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ 
-356,9 +356,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] @@ -546,9 +546,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3ad85c1f65dc7b37604eb0e89748faf0b9653065f2a8ef69f96a687ec1e9279" +checksum = "13418e745008f7349ec7e449155f419a61b92b58a99cc3616942b926825ec76b" [[package]] name = "core-foundation-sys" @@ -1023,9 +1023,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "531ac96c6ff5fd7c62263c5e3c67a603af4fcaee2e1a0ae5565ba3a11e69e549" dependencies = [ "futures-channel", "futures-core", @@ -1038,9 +1038,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "164713a5a0dcc3e7b4b1ed7d3b433cabc18025386f9339346e8daf15963cf7ac" dependencies = [ "futures-core", "futures-sink", @@ -1048,15 +1048,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "86d7a0c1aa76363dac491de0ee99faf6941128376f1cf96f07db7603b7de69dd" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = "1997dd9df74cdac935c76252744c1ed5794fac083242ea4fe77ef3ed60ba0f83" dependencies = [ "futures-core", "futures-task", @@ -1065,15 +1065,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "89d422fa3cbe3b40dca574ab087abb5bc98258ea57eea3fd6f1fa7162c778b91" [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "3eb14ed937631bd8b8b8977f2c198443447a8355b6e3ca599f38c975e5a963b6" dependencies = [ "proc-macro2", "quote", @@ -1082,21 +1082,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "ec93083a4aecafb2a80a885c9de1f0ccae9dbd32c2bb54b0c3a65690e0b8d2f2" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "fd65540d33b37b16542a0438c12e6aeead10d4ac5d05bd3f805b8f35ab592879" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "3ef6b17e481503ec85211fed8f39d1970f128935ca1f814cd32ac4a6842e84ab" dependencies = [ "futures-channel", "futures-core", @@ -1247,9 +1247,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" 
-version = "0.14.24" +version = "0.14.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e011372fa0b68db8350aa7a248930ecc7839bf46d8485577d69f117a75f164c" +checksum = "cc5e554ff619822309ffd57d8734d77cd5ce6238bc956f037ea06c58238c9899" dependencies = [ "bytes", "futures-channel", @@ -1462,9 +1462,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.139" +version = "0.2.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" [[package]] name = "libm" @@ -2172,18 +2172,18 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.153" +version = "1.0.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a382c72b4ba118526e187430bb4963cd6d55051ebf13d9b25574d379cc98d20" +checksum = "71f2b4817415c6d4210bfe1c7bfcf4801b2d904cb4d0e1a8fdb651013c9e86b8" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.153" +version = "1.0.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ef476a5790f0f6decbc66726b6e5d63680ed518283e64c7df415989d880954f" +checksum = "d071a94a3fac4aff69d023a7f411e33f40f3483f8c5190b1953822b6b76d7630" dependencies = [ "proc-macro2", "quote", @@ -2569,9 +2569,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-bidi" -version = "0.3.10" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = "524b68aca1d05e03fdf03fcdce2c6c94b6daf6d16861ddaa7e4f2b6638a9052c" [[package]] name = "unicode-ident" @@ -2619,9 +2619,9 @@ dependencies = [ [[package]] name = "utf8parse" -version = "0.2.0" +version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "936e4b492acfd135421d8dca4b1aa80a7bfc26e702ef3af710e0752684df5372" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 10107306a614..69b3e1841ce3 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -648,27 +648,24 @@ fn ts_sub_to_interval( rhs_ts: &i64, lhs_tz: &Option, rhs_tz: &Option, - scale_factor: i32, + scale_factor: i64, ) -> Result { // Conversion of integer and string-typed timestamps to NaiveDateTime objects // Timezone offsets are added also if applicable. - let (naive_date_time2_unchecked, naive_date_time1_unchecked) = + let (naive_date_time2, naive_date_time1) = with_timezone_to_naive_datetime(lhs_ts, rhs_ts, lhs_tz, rhs_tz, &scale_factor)?; - // Check whether we will find a negative interval or not. - let (naive_date_time2, naive_date_time1, sign) = - find_interval_sign(naive_date_time2_unchecked, naive_date_time1_unchecked); - let delta_secs = naive_date_time2.signed_duration_since(naive_date_time1); + // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day + let number_of_millisecs_in_day: i64 = 86_400_000; match scale_factor { - // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day 1 => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( - sign * (as_millisecs / 86_400_000) as i32, - sign * (as_millisecs % 86_400_000) as i32, + (as_millisecs / number_of_millisecs_in_day) as i32, + (as_millisecs % number_of_millisecs_in_day) as i32, ), ))) } @@ -682,8 +679,8 @@ fn ts_sub_to_interval( Ok(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value( 0, - sign * (as_nanosecs / 86_400_000_000_000) as i32, - sign as i64 * (as_nanosecs % 86_400_000_000_000), + (as_nanosecs / (number_of_millisecs_in_day * 
scale_factor)) as i32, + as_nanosecs % (number_of_millisecs_in_day * scale_factor), ), ))) } @@ -698,7 +695,7 @@ fn with_timezone_to_naive_datetime( rhs_ts: &i64, lhs_tz: &Option, rhs_tz: &Option, - scale_factor: &i32, + scale_factor: &i64, ) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { let (naive_lhs, naive_rhs) = match scale_factor { 1 => ms_to_naive_datetime(lhs_ts, rhs_ts)?, @@ -785,17 +782,6 @@ fn parse_tz_to_offset(tz: &str) -> Result { }; Ok(timezone_offset) } -#[inline] -fn find_interval_sign( - ndt2: NaiveDateTime, - ndt1: NaiveDateTime, -) -> (NaiveDateTime, NaiveDateTime, i32) { - if ndt2.timestamp_nanos() < ndt1.timestamp_nanos() { - (ndt1, ndt2, -1) - } else { - (ndt2, ndt1, 1) - } -} #[inline] pub fn date32_add(days: i32, scalar: &ScalarValue, sign: i32) -> Result { @@ -4755,7 +4741,7 @@ mod tests { fn timestamp_op_random_tests() { // timestamp1 + (or -) interval = timestamp2 // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? - let sample_size = 1000000; + let sample_size = 100000; let timestamps1 = get_random_timestamps(sample_size); let intervals = get_random_intervals(sample_size); // ts(sec) + interval(ns) = ts(sec); however, From c274aefb1d2484fa8aa50c0006d58187f1a1d4a9 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 13 Mar 2023 13:32:14 +0300 Subject: [PATCH 13/55] test format changed --- datafusion-cli/Cargo.lock | 16 +- datafusion/common/src/scalar.rs | 499 ++++++++++++++++---------------- 2 files changed, 265 insertions(+), 250 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index a5e40209fe1d..d253d1d90a97 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -440,9 +440,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" dependencies = [ "iana-time-zone", "num-integer", @@ -1889,9 +1889,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.51" +version = "1.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +checksum = "1d0e1ae9e836cc3beddd63db0df682593d7e2d3d891ae8c9083d2113e1744224" dependencies = [ "unicode-ident", ] @@ -1908,9 +1908,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "50686e0021c4136d1d453b2dfe059902278681512a34d4248435dc34b6b5c8ec" dependencies = [ "proc-macro2", ] @@ -2160,9 +2160,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "seq-macro" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 69b3e1841ce3..963e60555144 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -519,7 +519,7 @@ macro_rules! impl_op { &ts_rhs.checked_mul(1_000).ok_or_else(err)?, &tz_lhs, &tz_rhs, - 1, + IntervalMode::Milli, )?) } _ => Err(DataFusionError::Internal(format!( @@ -533,7 +533,13 @@ macro_rules! 
impl_op { ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), ) => match get_sign!($OPERATION) { - -1 => Ok(ts_sub_to_interval(&ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, 1)?), + -1 => Ok(ts_sub_to_interval( + &ts_lhs, + &ts_rhs, + &tz_lhs, + &tz_rhs, + IntervalMode::Milli, + )?), _ => Err(DataFusionError::Internal(format!( "Operator {} is not implemented for {:?} and {:?}", stringify!($OPERATION), @@ -557,7 +563,7 @@ macro_rules! impl_op { &ts_rhs.checked_mul(1_000).ok_or_else(err)?, &tz_lhs, &tz_rhs, - 1_000_000, + IntervalMode::Nano, )?) } _ => Err(DataFusionError::Internal(format!( @@ -572,7 +578,11 @@ macro_rules! impl_op { ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), ) => match get_sign!($OPERATION) { -1 => Ok(ts_sub_to_interval( - &ts_lhs, &ts_rhs, &tz_lhs, &tz_rhs, 1_000_000, + &ts_lhs, + &ts_rhs, + &tz_lhs, + &tz_rhs, + IntervalMode::Nano, )?), _ => Err(DataFusionError::Internal(format!( "Operator {} is not implemented for {:?} and {:?}", @@ -641,6 +651,10 @@ macro_rules! get_sign { }; } +enum IntervalMode { + Milli, + Nano, +} // Timestamp(sec) and Timestamp(millisec) difference is resulting as Interval(days, millis) // Timestamp(microsec) and Tiemstamp(nanosec) difference is resulting as Interval(days, nanos) fn ts_sub_to_interval( @@ -648,19 +662,19 @@ fn ts_sub_to_interval( rhs_ts: &i64, lhs_tz: &Option, rhs_tz: &Option, - scale_factor: i64, + mode: IntervalMode, ) -> Result { // Conversion of integer and string-typed timestamps to NaiveDateTime objects // Timezone offsets are added also if applicable. 
let (naive_date_time2, naive_date_time1) = - with_timezone_to_naive_datetime(lhs_ts, rhs_ts, lhs_tz, rhs_tz, &scale_factor)?; + with_timezone_to_naive_datetime(lhs_ts, rhs_ts, lhs_tz, rhs_tz, &mode)?; let delta_secs = naive_date_time2.signed_duration_since(naive_date_time1); // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day let number_of_millisecs_in_day: i64 = 86_400_000; - match scale_factor { - 1 => { + match mode { + IntervalMode::Milli => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( @@ -670,7 +684,7 @@ fn ts_sub_to_interval( ))) } // 60 * 60 * 24 * 1000_000_000 = 86_400_000_000_000, number of nanosecs in a day - 1_000_000 => { + IntervalMode::Nano => { let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( "timestamp difference cannot be shown in nanosecond precision", @@ -679,14 +693,11 @@ fn ts_sub_to_interval( Ok(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value( 0, - (as_nanosecs / (number_of_millisecs_in_day * scale_factor)) as i32, - as_nanosecs % (number_of_millisecs_in_day * scale_factor), + (as_nanosecs / (number_of_millisecs_in_day * 1_000_000)) as i32, + as_nanosecs % (number_of_millisecs_in_day * 1_000_000), ), ))) } - _ => Err(DataFusionError::Execution(String::from( - "undefined scale factor", - ))), } } #[inline] @@ -695,16 +706,11 @@ fn with_timezone_to_naive_datetime( rhs_ts: &i64, lhs_tz: &Option, rhs_tz: &Option, - scale_factor: &i64, + mode: &IntervalMode, ) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { - let (naive_lhs, naive_rhs) = match scale_factor { - 1 => ms_to_naive_datetime(lhs_ts, rhs_ts)?, - 1_000_000 => ns_to_naive_datetime(lhs_ts, rhs_ts)?, - _ => { - return Err(DataFusionError::Execution(String::from( - "undefined scale factor", - ))) - } + let (naive_lhs, naive_rhs) = match mode { + IntervalMode::Milli => ms_to_naive_datetime(lhs_ts, rhs_ts)?, + 
IntervalMode::Nano => ns_to_naive_datetime(lhs_ts, rhs_ts)?, }; match (lhs_tz, rhs_tz) { @@ -4711,37 +4717,23 @@ mod tests { #[test] fn timestamp_op_tests() { // positive interval, edge cases - let vec_timestamps_next = timestamps_next(); - let vec_timestamps_prev = timestamps_prev(); - let expected_results = get_expected_results(1); - - for (idx, exp) in expected_results.iter().enumerate() { - assert_eq!( - *exp, - vec_timestamps_next[idx] - .sub(&vec_timestamps_prev[idx]) - .unwrap() - ) + let test_data = get_test_data(1); + + for (idx, exp) in test_data.iter().enumerate() { + assert_eq!(exp.2, test_data[idx].0.sub(&test_data[idx].1).unwrap()) } // negative interval, edge cases - let vec_timestamps_next = timestamps_prev(); - let vec_timestamps_prev = timestamps_next(); - let expected_results = get_expected_results(-1); - for (idx, exp) in expected_results.iter().enumerate() { - assert_eq!( - *exp, - vec_timestamps_next[idx] - .sub(&vec_timestamps_prev[idx]) - .unwrap() - ); + let test_data = get_test_data(-1); + for (idx, exp) in test_data.iter().enumerate() { + assert_eq!(exp.2, test_data[idx].1.sub(&test_data[idx].0).unwrap()); } } #[test] fn timestamp_op_random_tests() { // timestamp1 + (or -) interval = timestamp2 // timestamp2 - timestamp1 (or timestamp1 - timestamp2) = interval ? - let sample_size = 100000; + let sample_size = 1000000; let timestamps1 = get_random_timestamps(sample_size); let intervals = get_random_intervals(sample_size); // ts(sec) + interval(ns) = ts(sec); however, @@ -4772,231 +4764,254 @@ mod tests { } } - fn timestamps_next() -> Vec { - vec![ - ScalarValue::TimestampNanosecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_nano_opt(1, 0, 0, 000_000_000) - .unwrap() - .timestamp_nanos(), + fn get_test_data(sign: i32) -> Vec<(ScalarValue, ScalarValue, ScalarValue)> { + let test_data = vec![ + ( + // 1. 
test case + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_nano_opt(1, 0, 0, 000_000_000) + .unwrap() + .timestamp_nanos(), + ), + Some("+01:00".to_string()), ), - Some("+01:00".to_string()), - ), - ScalarValue::TimestampMicrosecond( - Some( - NaiveDate::from_ymd_opt(2023, 3, 1) - .unwrap() - .and_hms_micro_opt(2, 0, 0, 000_000) - .unwrap() - .timestamp_micros(), + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 000_000_000) + .unwrap() + .timestamp_nanos(), + ), + Some("+00:00".to_string()), ), - Some("+01:00".to_string()), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, 0, 0), + )), ), - ScalarValue::TimestampMillisecond( - Some( - NaiveDate::from_ymd_opt(2023, 2, 11) - .unwrap() - .and_hms_milli_opt(10, 10, 0, 000) - .unwrap() - .timestamp_millis(), + // 2. test case + ( + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_micro_opt(2, 0, 0, 000_000) + .unwrap() + .timestamp_micros(), + ), + Some("+01:00".to_string()), ), - Some("+10:10".to_string()), - ), - ScalarValue::TimestampSecond( - Some( - NaiveDate::from_ymd_opt(2023, 3, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap() - .timestamp(), + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_micro_opt(0, 0, 0, 000_000) + .unwrap() + .timestamp_micros(), + ), + Some("-01:00".to_string()), ), - Some("-11:59".to_string()), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, sign * 59, 0), + )), ), - ScalarValue::TimestampMillisecond( - Some( - NaiveDate::from_ymd_opt(2023, 3, 1) - .unwrap() - .and_hms_milli_opt(23, 58, 0, 250) - .unwrap() - .timestamp_millis(), + // 3. 
test case + ( + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 2, 11) + .unwrap() + .and_hms_milli_opt(10, 10, 0, 000) + .unwrap() + .timestamp_millis(), + ), + Some("+10:10".to_string()), ), - Some("+11:59".to_string()), - ), - ScalarValue::TimestampMicrosecond( - Some( - NaiveDate::from_ymd_opt(2023, 3, 1) - .unwrap() - .and_hms_micro_opt(0, 0, 0, 15) - .unwrap() - .timestamp_micros(), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_milli_opt(1, 0, 0, 000) + .unwrap() + .timestamp_millis(), + ), + Some("+01:00".to_string()), ), - None, + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 41, + 0, + ))), ), - ScalarValue::TimestampNanosecond( - Some( - NaiveDate::from_ymd_opt(2023, 3, 1) - .unwrap() - .and_hms_nano_opt(0, 0, 0, 22) - .unwrap() - .timestamp_nanos(), + // 4. test case + ( + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + Some("-11:59".to_string()), ), - None, - ), - ScalarValue::TimestampSecond( - Some( - NaiveDate::from_ymd_opt(2023, 3, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap() - .timestamp(), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_opt(23, 58, 0) + .unwrap() + .timestamp(), + ), + Some("+11:59".to_string()), ), - None, + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + 0, + ))), ), - ScalarValue::TimestampSecond( - Some( - NaiveDate::from_ymd_opt(2023, 12, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap() - .timestamp(), + // 5. 
test case + ( + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_milli_opt(23, 58, 0, 250) + .unwrap() + .timestamp_millis(), + ), + Some("+11:59".to_string()), ), - None, - ), - ] - } - - fn timestamps_prev() -> Vec { - vec![ - ScalarValue::TimestampNanosecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_nano_opt(0, 0, 0, 000_000_000) - .unwrap() - .timestamp_nanos(), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_milli_opt(0, 0, 0, 000) + .unwrap() + .timestamp_millis(), + ), + Some("-11:59".to_string()), ), - Some("+00:00".to_string()), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 59, + sign * 250, + ))), ), - ScalarValue::TimestampMicrosecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_micro_opt(0, 0, 0, 000_000) - .unwrap() - .timestamp_micros(), + // 6. test case + ( + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_micro_opt(0, 0, 0, 15) + .unwrap() + .timestamp_micros(), + ), + None, ), - Some("-01:00".to_string()), - ), - ScalarValue::TimestampMillisecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_milli_opt(1, 0, 0, 000) - .unwrap() - .timestamp_millis(), + ScalarValue::TimestampMicrosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_micro_opt(0, 0, 0, 000_000) + .unwrap() + .timestamp_micros(), + ), + None, ), - Some("+01:00".to_string()), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value( + 0, + sign * 59, + sign as i64 * 15_000, + ), + )), ), - ScalarValue::TimestampSecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_opt(23, 58, 0) - .unwrap() - .timestamp(), + // 7. 
test case + ( + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 22) + .unwrap() + .timestamp_nanos(), + ), + None, ), - Some("+11:59".to_string()), - ), - ScalarValue::TimestampMillisecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_milli_opt(0, 0, 0, 000) - .unwrap() - .timestamp_millis(), + ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 31) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 000_000_000) + .unwrap() + .timestamp_nanos(), + ), + None, ), - Some("-11:59".to_string()), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), + )), ), - ScalarValue::TimestampMicrosecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 1) - .unwrap() - .and_hms_micro_opt(0, 0, 0, 000_000) - .unwrap() - .timestamp_micros(), + // 8. test case + ( + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, ), - None, - ), - ScalarValue::TimestampNanosecond( - Some( - NaiveDate::from_ymd_opt(2023, 1, 31) - .unwrap() - .and_hms_nano_opt(0, 0, 0, 000_000_000) - .unwrap() - .timestamp_nanos(), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2021, 12, 30) + .unwrap() + .and_hms_opt(0, 0, 30) + .unwrap() + .timestamp(), + ), + None, ), - None, + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 425, + sign * 86370000, + ))), ), - ScalarValue::TimestampSecond( - Some( - NaiveDate::from_ymd_opt(2021, 12, 30) - .unwrap() - .and_hms_opt(0, 0, 30) - .unwrap() - .timestamp(), + // 9. 
test case + ( + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 12, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, ), - None, - ), - ScalarValue::TimestampSecond( - Some( - NaiveDate::from_ymd_opt(1980, 11, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap() - .timestamp(), + ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(1980, 11, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, ), - None, + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + sign * 15735, + 0, + ))), ), - ] - } + ]; - fn get_expected_results(sign: i32) -> Vec { - vec![ - ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, 0, 0), - )), - ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, sign * 59, 0), - )), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 41, - 0, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - 0, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - sign * 250, - ))), - ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, sign * 59, sign as i64 * 15_000), - )), - ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), - )), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 425, - sign * 86370000, - ))), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 15735, - 0, - ))), - ] + test_data } fn get_random_timestamps(sample_size: u64) -> Vec { From 968a6824af09ccb9a5581d8c238ebac1ab0dff1f Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 13 Mar 2023 13:57:46 +0300 Subject: [PATCH 14/55] minor test fix --- datafusion/common/src/scalar.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/scalar.rs 
b/datafusion/common/src/scalar.rs index 963e60555144..cee1c4a4bc61 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -4719,14 +4719,14 @@ mod tests { // positive interval, edge cases let test_data = get_test_data(1); - for (idx, exp) in test_data.iter().enumerate() { - assert_eq!(exp.2, test_data[idx].0.sub(&test_data[idx].1).unwrap()) + for (lhs, rhs, expected) in test_data.iter() { + assert_eq!(expected, &lhs.sub(rhs).unwrap()) } // negative interval, edge cases let test_data = get_test_data(-1); - for (idx, exp) in test_data.iter().enumerate() { - assert_eq!(exp.2, test_data[idx].1.sub(&test_data[idx].0).unwrap()); + for (rhs, lhs, expected) in test_data.iter() { + assert_eq!(expected, &lhs.sub(rhs).unwrap()); } } #[test] From ed637796a7177ecc52986a06b948a696fe80a00f Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 13 Mar 2023 15:32:10 +0300 Subject: [PATCH 15/55] Update scalar.rs --- datafusion/common/src/scalar.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index cee1c4a4bc61..46e6a25a74a0 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -4767,7 +4767,7 @@ mod tests { fn get_test_data(sign: i32) -> Vec<(ScalarValue, ScalarValue, ScalarValue)> { let test_data = vec![ ( - // 1. test case + // 1st test case ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) @@ -4792,7 +4792,7 @@ mod tests { IntervalMonthDayNanoType::make_value(0, 0, 0), )), ), - // 2. test case + // 2nd test case ( ScalarValue::TimestampMicrosecond( Some( @@ -4818,7 +4818,7 @@ mod tests { IntervalMonthDayNanoType::make_value(0, sign * 59, 0), )), ), - // 3. test case + // 3rd test case ( ScalarValue::TimestampMillisecond( Some( @@ -4845,7 +4845,7 @@ mod tests { 0, ))), ), - // 4. 
test case + // 4th test case ( ScalarValue::TimestampSecond( Some( @@ -4872,7 +4872,7 @@ mod tests { 0, ))), ), - // 5. test case + // 5th test case ( ScalarValue::TimestampMillisecond( Some( @@ -4899,7 +4899,7 @@ mod tests { sign * 250, ))), ), - // 6. test case + // 6th test case ( ScalarValue::TimestampMicrosecond( Some( @@ -4929,7 +4929,7 @@ mod tests { ), )), ), - // 7. test case + // 7th test case ( ScalarValue::TimestampNanosecond( Some( @@ -4955,7 +4955,7 @@ mod tests { IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), )), ), - // 8. test case + // 8th test case ( ScalarValue::TimestampSecond( Some( @@ -4982,7 +4982,7 @@ mod tests { sign * 86370000, ))), ), - // 9. test case + // 9th test case ( ScalarValue::TimestampSecond( Some( From 68ea6479333620fe583f601608cd24454832442a Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Mon, 13 Mar 2023 18:10:59 -0500 Subject: [PATCH 16/55] Refactoring and simplifications --- datafusion-cli/Cargo.lock | 39 ++-- datafusion/common/Cargo.toml | 3 +- datafusion/common/src/scalar.rs | 364 ++++++++++++++------------------ 3 files changed, 178 insertions(+), 228 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 6a2763502b9e..84b4ec7101f8 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -741,7 +741,6 @@ dependencies = [ "num_cpus", "object_store", "parquet", - "rand", "sqlparser", ] @@ -1908,9 +1907,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.24" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50686e0021c4136d1d453b2dfe059902278681512a34d4248435dc34b6b5c8ec" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -2175,7 +2174,6 @@ name = "serde" version = "1.0.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71f2b4817415c6d4210bfe1c7bfcf4801b2d904cb4d0e1a8fdb651013c9e86b8" - 
dependencies = [ "serde_derive", ] @@ -2185,7 +2183,6 @@ name = "serde_derive" version = "1.0.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d071a94a3fac4aff69d023a7f411e33f40f3483f8c5190b1953822b6b76d7630" - dependencies = [ "proc-macro2", "quote", @@ -2832,9 +2829,9 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -2847,45 +2844,45 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" [[package]] name = "windows_x86_64_gnu" -version = 
"0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" [[package]] name = "winreg" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 4780962f3e8e..444ce9a2e0ae 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -47,6 +47,7 @@ num_cpus = "1.13.0" object_store = { version = "0.5.4", default-features = false, optional = true } parquet = { workspace = true, default-features = false, optional = true } pyo3 = { version = "0.18.0", optional = true } -rand = "0.8.4" sqlparser = "0.32" +[dev-dependencies] +rand = "0.8.4" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 46e6a25a74a0..bedbd1a328c5 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -464,6 +464,71 @@ macro_rules! unsigned_subtraction_error { } macro_rules! 
impl_op { + ($LHS:expr, $RHS:expr, +) => { + impl_op_symmetric!($LHS, $RHS, +) + }; + ($LHS:expr, $RHS:expr, -) => { + match ($LHS, $RHS) { + ( + ScalarValue::TimestampSecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampSecond(Some(ts_rhs), tz_rhs), + ) => { + let err = || { + DataFusionError::Execution( + "Overflow while converting seconds to milliseconds".to_string(), + ) + }; + ts_sub_to_interval( + ts_lhs.checked_mul(1_000).ok_or_else(err)?, + ts_rhs.checked_mul(1_000).ok_or_else(err)?, + &tz_lhs, + &tz_rhs, + IntervalMode::Milli, + ) + }, + ( + ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), + ) => ts_sub_to_interval( + *ts_lhs, + *ts_rhs, + tz_lhs, + tz_rhs, + IntervalMode::Milli, + ), + ( + ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampMicrosecond(Some(ts_rhs), tz_rhs), + ) => { + let err = || { + DataFusionError::Execution( + "Overflow while converting microseconds to nanoseconds".to_string(), + ) + }; + ts_sub_to_interval( + ts_lhs.checked_mul(1_000).ok_or_else(err)?, + ts_rhs.checked_mul(1_000).ok_or_else(err)?, + tz_lhs, + tz_rhs, + IntervalMode::Nano, + ) + }, + ( + ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), + ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), + ) => ts_sub_to_interval( + *ts_lhs, + *ts_rhs, + tz_lhs, + tz_rhs, + IntervalMode::Nano, + ), + _ => impl_op_symmetric!($LHS, $RHS, -) + } + }; +} + +macro_rules! impl_op_symmetric { ($LHS:expr, $RHS:expr, $OPERATION:tt) => { match ($LHS, $RHS) { // Binary operations on arguments with the same type: @@ -503,94 +568,6 @@ macro_rules! 
impl_op { (ScalarValue::Int8(lhs), ScalarValue::Int8(rhs)) => { primitive_op!(lhs, rhs, Int8, $OPERATION) } - ( - ScalarValue::TimestampSecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampSecond(Some(ts_rhs), tz_rhs), - ) => match get_sign!($OPERATION) { - -1 => { - let err = || { - DataFusionError::Execution( - "Overflow while conversion from second to millisecond" - .to_string(), - ) - }; - Ok(ts_sub_to_interval( - &ts_lhs.checked_mul(1_000).ok_or_else(err)?, - &ts_rhs.checked_mul(1_000).ok_or_else(err)?, - &tz_lhs, - &tz_rhs, - IntervalMode::Milli, - )?) - } - _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for {:?} and {:?}", - stringify!($OPERATION), - $LHS, - $RHS - ))), - }, - ( - ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), - ) => match get_sign!($OPERATION) { - -1 => Ok(ts_sub_to_interval( - &ts_lhs, - &ts_rhs, - &tz_lhs, - &tz_rhs, - IntervalMode::Milli, - )?), - _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for {:?} and {:?}", - stringify!($OPERATION), - $LHS, - $RHS - ))), - }, - ( - ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampMicrosecond(Some(ts_rhs), tz_rhs), - ) => match get_sign!($OPERATION) { - -1 => { - let err = || { - DataFusionError::Execution( - "Overflow while conversion from microsecond to nanosecond" - .to_string(), - ) - }; - Ok(ts_sub_to_interval( - &ts_lhs.checked_mul(1_000).ok_or_else(err)?, - &ts_rhs.checked_mul(1_000).ok_or_else(err)?, - &tz_lhs, - &tz_rhs, - IntervalMode::Nano, - )?) 
- } - _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for {:?} and {:?}", - stringify!($OPERATION), - $LHS, - $RHS - ))), - }, - ( - ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), - ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), - ) => match get_sign!($OPERATION) { - -1 => Ok(ts_sub_to_interval( - &ts_lhs, - &ts_rhs, - &tz_lhs, - &tz_rhs, - IntervalMode::Nano, - )?), - _ => Err(DataFusionError::Internal(format!( - "Operator {} is not implemented for {:?} and {:?}", - stringify!($OPERATION), - $LHS, - $RHS - ))), - }, // Binary operations on arguments with different types: (ScalarValue::Date32(Some(days)), _) => { let value = date32_add(*days, $RHS, get_sign!($OPERATION))?; @@ -651,184 +628,156 @@ macro_rules! get_sign { }; } +#[derive(Clone, Copy)] enum IntervalMode { Milli, Nano, } -// Timestamp(sec) and Timestamp(millisec) difference is resulting as Interval(days, millis) -// Timestamp(microsec) and Tiemstamp(nanosec) difference is resulting as Interval(days, nanos) + +/// This function subtracts `rhs_ts` from `lhs_ts`, taking timezones +/// into account when given. The units of the resulting interval are specified by +/// the argument `mode`. +/// The default behavior of Datafusion is the following: +/// - When subtracting timestamps at seconds/milliseconds precision, the output +/// interval will have the type [`IntervalDayTimeType`]. +/// - When subtracting timestamps at microseconds/nanoseconds precision, the +/// output interval will have the type [`IntervalMonthDayNanoType`]. fn ts_sub_to_interval( - lhs_ts: &i64, - rhs_ts: &i64, + lhs_ts: i64, + rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, mode: IntervalMode, ) -> Result { - // Conversion of integer and string-typed timestamps to NaiveDateTime objects - // Timezone offsets are added also if applicable.
- let (naive_date_time2, naive_date_time1) = - with_timezone_to_naive_datetime(lhs_ts, rhs_ts, lhs_tz, rhs_tz, &mode)?; - - let delta_secs = naive_date_time2.signed_duration_since(naive_date_time1); + let lhs_dt = with_timezone_to_naive_datetime(lhs_ts, lhs_tz, mode)?; + let rhs_dt = with_timezone_to_naive_datetime(rhs_ts, rhs_tz, mode)?; + let delta_secs = lhs_dt.signed_duration_since(rhs_dt); - // 60 * 60 * 24 * 1000 = 86_400_000, number of millisecs in a day - let number_of_millisecs_in_day: i64 = 86_400_000; match mode { IntervalMode::Milli => { + const MILLISECS_IN_ONE_DAY: i64 = 86_400_000; let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( - (as_millisecs / number_of_millisecs_in_day) as i32, - (as_millisecs % number_of_millisecs_in_day) as i32, + (as_millisecs / MILLISECS_IN_ONE_DAY) as i32, + (as_millisecs % MILLISECS_IN_ONE_DAY) as i32, ), ))) } - // 60 * 60 * 24 * 1000_000_000 = 86_400_000_000_000, number of nanosecs in a day IntervalMode::Nano => { + const NANOSECS_IN_ONE_DAY: i64 = 86_400_000_000_000; let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( - "timestamp difference cannot be shown in nanosecond precision", + "Can not compute timestamp differences with nanosecond precision", )) })?; Ok(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value( 0, - (as_nanosecs / (number_of_millisecs_in_day * 1_000_000)) as i32, - as_nanosecs % (number_of_millisecs_in_day * 1_000_000), + (as_nanosecs / NANOSECS_IN_ONE_DAY) as i32, + as_nanosecs % NANOSECS_IN_ONE_DAY, ), ))) } } } + +/// This function creates the [`NaiveDateTime`] object corresponding to the +/// given timestamp using the units (tick size) implied by argument `mode`. 
#[inline] fn with_timezone_to_naive_datetime( - lhs_ts: &i64, - rhs_ts: &i64, - lhs_tz: &Option, - rhs_tz: &Option, - mode: &IntervalMode, -) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { - let (naive_lhs, naive_rhs) = match mode { - IntervalMode::Milli => ms_to_naive_datetime(lhs_ts, rhs_ts)?, - IntervalMode::Nano => ns_to_naive_datetime(lhs_ts, rhs_ts)?, + ts: i64, + tz: &Option, + mode: IntervalMode, +) -> Result { + let mut result = if let IntervalMode::Milli = mode { + ticks_to_naive_datetime::<1_000_000>(ts) + } else { + ticks_to_naive_datetime::<1>(ts) + }?; + if let Some(tz) = tz { + let offset = parse_tz_to_offset(tz)?; + result = DateTime::::from_utc(result, offset).naive_local(); }; - - match (lhs_tz, rhs_tz) { - (Some(l), Some(r)) => match (parse_tz_to_offset(l), parse_tz_to_offset(r)) { - (Ok(l), Ok(r)) => Ok(( - DateTime::::from_utc(naive_lhs, l).naive_local(), - DateTime::::from_utc(naive_rhs, r).naive_local(), - )), - (_, _) => Ok((naive_lhs, naive_rhs)), - }, - (_, _) => Ok((naive_lhs, naive_rhs)), - } -} -#[inline] -fn ms_to_naive_datetime( - lhs_ts_ms: &i64, - rhs_ts_ms: &i64, -) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { - match ( - NaiveDateTime::from_timestamp_opt( - lhs_ts_ms / 1_000, - (lhs_ts_ms % 1_000) as u32 * 1_000_000, - ), - NaiveDateTime::from_timestamp_opt( - rhs_ts_ms / 1_000, - (rhs_ts_ms % 1_000) as u32 * 1_000_000, - ), - ) { - (Some(x), Some(y)) => Ok((x, y)), - (x, y) => Err(DataFusionError::Execution(format!( - "timestamps {x:?} or {y:?} cannot be converted to NaiveDateTime", - ))), - } + Ok(result) } + +/// This function creates the [`NaiveDateTime`] object corresponding to the +/// given timestamp, whose tick size is specified by `UNIT_NANOS`. 
#[inline] -fn ns_to_naive_datetime( - lhs_ts_ns: &i64, - rhs_ts_ns: &i64, -) -> Result<(NaiveDateTime, NaiveDateTime), DataFusionError> { - match ( - NaiveDateTime::from_timestamp_opt( - lhs_ts_ns / 1_000_000_000, - (lhs_ts_ns % 1_000_000_000) as u32, - ), - NaiveDateTime::from_timestamp_opt( - rhs_ts_ns / 1_000_000_000, - (rhs_ts_ns % 1_000_000_000) as u32, - ), - ) { - (Some(x), Some(y)) => Ok((x, y)), - (x, y) => Err(DataFusionError::Execution(format!( - "timestamps {x:?} or {y:?} cannot be converted to NaiveDateTime", - ))), - } +fn ticks_to_naive_datetime(ticks: i64) -> Result { + NaiveDateTime::from_timestamp_opt( + (ticks * UNIT_NANOS) / 1_000_000_000, + ((ticks * UNIT_NANOS) % 1_000_000_000) as u32, + ) + .ok_or_else(|| { + DataFusionError::Execution( + "Can not convert given timestamp to a NaiveDateTime".to_string(), + ) + }) } -// This function parses as the format of "+HH:MM", for example, "+05:30" -#[inline] -fn parse_tz_to_offset(tz: &str) -> Result { - let err_str = &String::from("error while parsing timezone"); - let err = || DataFusionError::Execution(err_str.to_string()); - let sign = tz.chars().next().ok_or_else(err)?; +/// This function parses `tz` according to the format "+HH:MM" (e.g. "+05:30") +/// and returns a [`FixedOffset`] object.
+#[inline] +fn parse_tz_to_offset(tz: &str) -> Result { + const ERR_MSG: &str = "Can not parse timezone"; + let sign = tz + .chars() + .next() + .ok_or_else(|| DataFusionError::Execution(ERR_MSG.to_string()))?; let hours = tz[1..3] .parse::() - .map_err(|_e| DataFusionError::Execution(err_str.to_string()))?; + .map_err(|_| DataFusionError::Execution(ERR_MSG.to_string()))?; let minutes = tz[4..6] .parse::() - .map_err(|_e| DataFusionError::Execution(err_str.to_string()))?; - let timezone_offset = match sign { - '-' => FixedOffset::east_opt(hours * 3600 + minutes * 60).ok_or_else(err)?, - '+' => FixedOffset::west_opt(hours * 3600 + minutes * 60).ok_or_else(err)?, - _ => { - return Err(DataFusionError::Execution(err_str.to_string())); - } - }; - Ok(timezone_offset) + .map_err(|_| DataFusionError::Execution(ERR_MSG.to_string()))?; + match sign { + '-' => FixedOffset::east_opt(hours * 3600 + minutes * 60), + '+' => FixedOffset::west_opt(hours * 3600 + minutes * 60), + _ => None, + } + .ok_or_else(|| DataFusionError::Execution(ERR_MSG.to_string())) } #[inline] pub fn date32_add(days: i32, scalar: &ScalarValue, sign: i32) -> Result { let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); let prior = epoch.add(Duration::days(days as i64)); - let posterior = do_date_math(prior, scalar, sign)?; - Ok(posterior.sub(epoch).num_days() as i32) + do_date_math(prior, scalar, sign).map(|d| d.sub(epoch).num_days() as i32) } #[inline] pub fn date64_add(ms: i64, scalar: &ScalarValue, sign: i32) -> Result { let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); let prior = epoch.add(Duration::milliseconds(ms)); - let posterior = do_date_math(prior, scalar, sign)?; - Ok(posterior.sub(epoch).num_milliseconds()) + do_date_math(prior, scalar, sign).map(|d| d.sub(epoch).num_milliseconds()) } #[inline] pub fn seconds_add(ts_s: i64, scalar: &ScalarValue, sign: i32) -> Result { - Ok(do_date_time_math(ts_s, 0, scalar, sign)?.timestamp()) + do_date_time_math(ts_s, 0, scalar, sign).map(|dt| 
dt.timestamp()) } #[inline] pub fn milliseconds_add(ts_ms: i64, scalar: &ScalarValue, sign: i32) -> Result { let secs = ts_ms / 1000; let nsecs = ((ts_ms % 1000) * 1_000_000) as u32; - Ok(do_date_time_math(secs, nsecs, scalar, sign)?.timestamp_millis()) + do_date_time_math(secs, nsecs, scalar, sign).map(|dt| dt.timestamp_millis()) } #[inline] pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result { let secs = ts_us / 1_000_000; let nsecs = ((ts_us % 1_000_000) * 1000) as u32; - Ok(do_date_time_math(secs, nsecs, scalar, sign)?.timestamp_nanos() / 1000) + do_date_time_math(secs, nsecs, scalar, sign).map(|dt| dt.timestamp_nanos() / 1000) } #[inline] pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result { let secs = ts_ns / 1_000_000_000; let nsecs = (ts_ns % 1_000_000_000) as u32; - Ok(do_date_time_math(secs, nsecs, scalar, sign)?.timestamp_nanos()) + do_date_time_math(secs, nsecs, scalar, sign).map(|dt| dt.timestamp_nanos()) } #[inline] @@ -4717,16 +4666,15 @@ mod tests { #[test] fn timestamp_op_tests() { // positive interval, edge cases - let test_data = get_test_data(1); - - for (lhs, rhs, expected) in test_data.iter() { - assert_eq!(expected, &lhs.sub(rhs).unwrap()) + let test_data = get_timestamp_test_data(1); + for (lhs, rhs, expected) in test_data.into_iter() { + assert_eq!(expected, lhs.sub(rhs).unwrap()) } // negative interval, edge cases - let test_data = get_test_data(-1); - for (rhs, lhs, expected) in test_data.iter() { - assert_eq!(expected, &lhs.sub(rhs).unwrap()); + let test_data = get_timestamp_test_data(-1); + for (rhs, lhs, expected) in test_data.into_iter() { + assert_eq!(expected, lhs.sub(rhs).unwrap()); } } #[test] @@ -4764,8 +4712,10 @@ mod tests { } } - fn get_test_data(sign: i32) -> Vec<(ScalarValue, ScalarValue, ScalarValue)> { - let test_data = vec![ + fn get_timestamp_test_data( + sign: i32, + ) -> Vec<(ScalarValue, ScalarValue, ScalarValue)> { + vec![ ( // 1st test case 
ScalarValue::TimestampNanosecond( @@ -5009,9 +4959,7 @@ mod tests { 0, ))), ), - ]; - - test_data + ] } fn get_random_timestamps(sample_size: u64) -> Vec { @@ -5081,30 +5029,34 @@ mod tests { let vector_size = sample_size; let mut intervals = vec![]; let mut rng = rand::thread_rng(); + const SECS_IN_ONE_DAY: i32 = 86_400; + const MILLISECS_IN_ONE_DAY: i32 = 86_400_000; + const MICROSECS_IN_ONE_DAY: i64 = 86_400_000_000; + const NANOSECS_IN_ONE_DAY: i64 = 86_400_000_000_000; for i in 0..vector_size { if i % 4 == 0 { let days = rng.gen_range(0..5000); // to not break second precision - let millis = rng.gen_range(0..86_400) * 1000; + let millis = rng.gen_range(0..SECS_IN_ONE_DAY) * 1000; intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, millis), ))) } else if i % 4 == 1 { let days = rng.gen_range(0..5000); - let millisec = rng.gen_range(0..86_400_000); + let millisec = rng.gen_range(0..MILLISECS_IN_ONE_DAY); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, millisec), ))) } else if i % 4 == 2 { let days = rng.gen_range(0..5000); // to not break microsec precision - let nanosec = rng.gen_range(0..86_400_000_000) * 1000; + let nanosec = rng.gen_range(0..MICROSECS_IN_ONE_DAY) * 1000; intervals.push(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value(0, days, nanosec), ))) } else { let days = rng.gen_range(0..5000); - let nanosec = rng.gen_range(0..86_400_000_000_000); + let nanosec = rng.gen_range(0..NANOSECS_IN_ONE_DAY); intervals.push(ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value(0, days, nanosec), ))); From ed0446676a8f3bb35b0b3544d195fd94f600cfbe Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Mon, 13 Mar 2023 22:23:02 -0500 Subject: [PATCH 17/55] Make ScalarValue support interval comparison --- datafusion/common/src/scalar.rs | 486 +++++++++++++++++++++++++++++++- 1 file changed, 481 insertions(+), 5 deletions(-) diff --git 
a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index bedbd1a328c5..3d7ff1314f27 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -45,6 +45,12 @@ use arrow::{ }; use chrono::{DateTime, Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime}; +// Constants we use throughout this file: +const MILLISECS_IN_ONE_DAY: i64 = 86_400_000; +const NANOSECS_IN_ONE_DAY: i64 = 86_400_000_000_000; +const MILLISECS_IN_ONE_MONTH: i64 = 2_592_000_000; // assuming 30 days. +const NANOSECS_IN_ONE_MONTH: i128 = 2_592_000_000_000_000; // assuming 30 days. + /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part to arrow's [`Array`]. /// @@ -199,10 +205,28 @@ impl PartialEq for ScalarValue { (TimestampNanosecond(v1, _), TimestampNanosecond(v2, _)) => v1.eq(v2), (TimestampNanosecond(_, _), _) => false, (IntervalYearMonth(v1), IntervalYearMonth(v2)) => v1.eq(v2), + (IntervalYearMonth(v1), IntervalDayTime(v2)) => { + ym_to_milli(v1).eq(&dt_to_milli(v2)) + } + (IntervalYearMonth(v1), IntervalMonthDayNano(v2)) => { + ym_to_nano(v1).eq(&mdn_to_nano(v2)) + } (IntervalYearMonth(_), _) => false, (IntervalDayTime(v1), IntervalDayTime(v2)) => v1.eq(v2), + (IntervalDayTime(v1), IntervalYearMonth(v2)) => { + dt_to_milli(v1).eq(&ym_to_milli(v2)) + } + (IntervalDayTime(v1), IntervalMonthDayNano(v2)) => { + dt_to_nano(v1).eq(&mdn_to_nano(v2)) + } (IntervalDayTime(_), _) => false, (IntervalMonthDayNano(v1), IntervalMonthDayNano(v2)) => v1.eq(v2), + (IntervalMonthDayNano(v1), IntervalYearMonth(v2)) => { + mdn_to_nano(v1).eq(&ym_to_nano(v2)) + } + (IntervalMonthDayNano(v1), IntervalDayTime(v2)) => { + mdn_to_nano(v1).eq(&dt_to_nano(v2)) + } (IntervalMonthDayNano(_), _) => false, (Struct(v1, t1), Struct(v2, t2)) => v1.eq(v2) && t1.eq(t2), (Struct(_, _), _) => false, @@ -304,10 +328,28 @@ impl PartialOrd for ScalarValue { } (TimestampNanosecond(_, _), _) => None, (IntervalYearMonth(v1), 
IntervalYearMonth(v2)) => v1.partial_cmp(v2), + (IntervalYearMonth(v1), IntervalDayTime(v2)) => { + ym_to_milli(v1).partial_cmp(&dt_to_milli(v2)) + } + (IntervalYearMonth(v1), IntervalMonthDayNano(v2)) => { + ym_to_nano(v1).partial_cmp(&mdn_to_nano(v2)) + } (IntervalYearMonth(_), _) => None, (IntervalDayTime(v1), IntervalDayTime(v2)) => v1.partial_cmp(v2), + (IntervalDayTime(v1), IntervalYearMonth(v2)) => { + dt_to_milli(v1).partial_cmp(&ym_to_milli(v2)) + } + (IntervalDayTime(v1), IntervalMonthDayNano(v2)) => { + dt_to_nano(v1).partial_cmp(&mdn_to_nano(v2)) + } (IntervalDayTime(_), _) => None, (IntervalMonthDayNano(v1), IntervalMonthDayNano(v2)) => v1.partial_cmp(v2), + (IntervalMonthDayNano(v1), IntervalYearMonth(v2)) => { + mdn_to_nano(v1).partial_cmp(&ym_to_nano(v2)) + } + (IntervalMonthDayNano(v1), IntervalDayTime(v2)) => { + mdn_to_nano(v1).partial_cmp(&dt_to_nano(v2)) + } (IntervalMonthDayNano(_), _) => None, (Struct(v1, t1), Struct(v2, t2)) => { if t1.eq(t2) { @@ -332,6 +374,52 @@ impl PartialOrd for ScalarValue { } } +/// This function computes the duration (in milliseconds) of the given +/// year-month-interval. +#[inline] +fn ym_to_milli(val: &Option) -> Option { + val.map(|value| (value as i64) * MILLISECS_IN_ONE_MONTH) +} + +/// This function computes the duration (in nanoseconds) of the given +/// year-month-interval. +#[inline] +fn ym_to_nano(val: &Option) -> Option { + val.map(|value| (value as i128) * NANOSECS_IN_ONE_MONTH) +} + +/// This function computes the duration (in milliseconds) of the given +/// daytime-interval. +#[inline] +fn dt_to_milli(val: &Option) -> Option { + val.map(|val| { + let (days, millis) = IntervalDayTimeType::to_parts(val); + (days as i64) * MILLISECS_IN_ONE_DAY + (millis as i64) + }) +} + +/// This function computes the duration (in nanoseconds) of the given +/// daytime-interval. 
+#[inline] +fn dt_to_nano(val: &Option) -> Option { + val.map(|val| { + let (days, millis) = IntervalDayTimeType::to_parts(val); + (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + (millis as i128) * 1_000_000 + }) +} + +/// This function computes the duration (in nanoseconds) of the given +/// month-day-nano-interval. Assumes a month is 30 days long. +#[inline] +fn mdn_to_nano(val: &Option) -> Option { + val.map(|val| { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(val); + (months as i128) * NANOSECS_IN_ONE_MONTH + + (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + + (nanos as i128) + }) +} + impl Eq for ScalarValue {} // TODO implement this in arrow-rs with simd @@ -568,6 +656,43 @@ macro_rules! impl_op_symmetric { (ScalarValue::Int8(lhs), ScalarValue::Int8(rhs)) => { primitive_op!(lhs, rhs, Int8, $OPERATION) } + ( + ScalarValue::IntervalYearMonth(Some(lhs)), + ScalarValue::IntervalYearMonth(Some(rhs)), + ) => Ok(ScalarValue::IntervalYearMonth(Some( + IntervalYearMonthType::make_value(0, lhs + rhs * get_sign!($OPERATION)), + ))), + ( + ScalarValue::IntervalDayTime(Some(lhs)), + ScalarValue::IntervalDayTime(Some(rhs)), + ) => { + let sign = get_sign!($OPERATION); + let (lhs_days, lhs_millis) = IntervalDayTimeType::to_parts(*lhs); + let (rhs_days, rhs_millis) = IntervalDayTimeType::to_parts(*rhs); + Ok(ScalarValue::IntervalDayTime(Some( + IntervalDayTimeType::make_value( + lhs_days + rhs_days * sign, + lhs_millis + rhs_millis * sign, + ), + ))) + } + ( + ScalarValue::IntervalMonthDayNano(Some(lhs)), + ScalarValue::IntervalMonthDayNano(Some(rhs)), + ) => { + let sign = get_sign!($OPERATION); + let (lhs_months, lhs_days, lhs_nanos) = + IntervalMonthDayNanoType::to_parts(*lhs); + let (rhs_months, rhs_days, rhs_nanos) = + IntervalMonthDayNanoType::to_parts(*rhs); + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value( + lhs_months + rhs_months * sign, + lhs_days + rhs_days * sign, + lhs_nanos + rhs_nanos * (sign as i64), + 
), + ))) + } // Binary operations on arguments with different types: (ScalarValue::Date32(Some(days)), _) => { let value = date32_add(*days, $RHS, get_sign!($OPERATION))?; @@ -609,6 +734,30 @@ macro_rules! impl_op_symmetric { let value = nanoseconds_add(*ts_ns, $LHS, get_sign!($OPERATION))?; Ok(ScalarValue::TimestampNanosecond(Some(value), zone.clone())) } + ( + ScalarValue::IntervalYearMonth(Some(lhs)), + ScalarValue::IntervalDayTime(Some(rhs)), + ) => op_ym_dt(*lhs, *rhs, get_sign!($OPERATION), false), + ( + ScalarValue::IntervalYearMonth(Some(lhs)), + ScalarValue::IntervalMonthDayNano(Some(rhs)), + ) => op_ym_mdn(*lhs, *rhs, get_sign!($OPERATION), false), + ( + ScalarValue::IntervalDayTime(Some(lhs)), + ScalarValue::IntervalYearMonth(Some(rhs)), + ) => op_ym_dt(*rhs, *lhs, get_sign!($OPERATION), true), + ( + ScalarValue::IntervalDayTime(Some(lhs)), + ScalarValue::IntervalMonthDayNano(Some(rhs)), + ) => op_dt_mdn(*lhs, *rhs, get_sign!($OPERATION), false), + ( + ScalarValue::IntervalMonthDayNano(Some(lhs)), + ScalarValue::IntervalYearMonth(Some(rhs)), + ) => op_ym_mdn(*rhs, *lhs, get_sign!($OPERATION), true), + ( + ScalarValue::IntervalMonthDayNano(Some(lhs)), + ScalarValue::IntervalDayTime(Some(rhs)), + ) => op_dt_mdn(*rhs, *lhs, get_sign!($OPERATION), true), _ => Err(DataFusionError::Internal(format!( "Operator {} is not implemented for types {:?} and {:?}", stringify!($OPERATION), @@ -619,6 +768,72 @@ macro_rules! impl_op_symmetric { }; } +/// This function adds/subtracts two "raw" intervals (`lhs` and `rhs`) of different +/// types ([`IntervalYearMonthType`] and [`IntervalDayTimeType`], respectively). +/// The argument `sign` chooses between addition and subtraction, the argument +/// `commute` swaps `lhs` and `rhs`. The return value is an interval [`ScalarValue`] +/// with type data type [`IntervalMonthDayNanoType`]. 
+#[inline] +fn op_ym_dt(mut lhs: i32, rhs: i64, sign: i32, commute: bool) -> Result { + let (mut days, millis) = IntervalDayTimeType::to_parts(rhs); + let mut nanos = (millis as i64) * 1_000_000; + if commute { + lhs *= sign; + } else { + days *= sign; + nanos *= sign as i64; + }; + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(lhs, days, nanos), + ))) +} + +/// This function adds/subtracts two "raw" intervals (`lhs` and `rhs`) of different +/// types ([`IntervalYearMonthType`] and [`IntervalMonthDayNanoType`], respectively). +/// The argument `sign` chooses between addition and subtraction, the argument +/// `commute` swaps `lhs` and `rhs`. The return value is an interval [`ScalarValue`] +/// with type data type [`IntervalMonthDayNanoType`]. +#[inline] +fn op_ym_mdn(lhs: i32, rhs: i128, sign: i32, commute: bool) -> Result { + let (mut months, mut days, mut nanos) = IntervalMonthDayNanoType::to_parts(rhs); + if commute { + months += lhs * sign; + } else { + months = lhs + (months * sign); + days *= sign; + nanos *= sign as i64; + } + Ok(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(months, days, nanos), + ))) +} + +/// This function adds/subtracts two "raw" intervals (`lhs` and `rhs`) of different +/// types ([`IntervalDayTimeType`] and [`IntervalMonthDayNanoType`], respectively). +/// The argument `sign` chooses between addition and subtraction, the argument +/// `commute` swaps `lhs` and `rhs`. The return value is an interval [`ScalarValue`] +/// with type data type [`IntervalMonthDayNanoType`]. 
+#[inline] +fn op_dt_mdn(lhs: i64, rhs: i128, sign: i32, commute: bool) -> Result { + let (lhs_days, lhs_millis) = IntervalDayTimeType::to_parts(lhs); + let (rhs_months, rhs_days, rhs_nanos) = IntervalMonthDayNanoType::to_parts(rhs); + + let result = if commute { + IntervalMonthDayNanoType::make_value( + rhs_months, + lhs_days * sign + rhs_days, + (lhs_millis * sign) as i64 * 1_000_000 + rhs_nanos, + ) + } else { + IntervalMonthDayNanoType::make_value( + rhs_months * sign, + lhs_days + rhs_days * sign, + (lhs_millis as i64) * 1_000_000 + rhs_nanos * (sign as i64), + ) + }; + Ok(ScalarValue::IntervalMonthDayNano(Some(result))) +} + macro_rules! get_sign { (+) => { 1 @@ -655,7 +870,6 @@ fn ts_sub_to_interval( match mode { IntervalMode::Milli => { - const MILLISECS_IN_ONE_DAY: i64 = 86_400_000; let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value( @@ -665,7 +879,6 @@ fn ts_sub_to_interval( ))) } IntervalMode::Nano => { - const NANOSECS_IN_ONE_DAY: i64 = 86_400_000_000_000; let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( "Can not compute timestamp differences with nanosecond precision", @@ -3883,6 +4096,53 @@ mod tests { ])), None ); + // Different type of intervals can be compared. 
+ assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(1, 2))) + < IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 14, 0, 1 + ))), + ); + assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 4))) + >= IntervalDayTime(Some(IntervalDayTimeType::make_value(119, 1))) + ); + assert!( + IntervalDayTime(Some(IntervalDayTimeType::make_value(12, 86_399_999))) + >= IntervalDayTime(Some(IntervalDayTimeType::make_value(12, 0))) + ); + assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(2, 12))) + == IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 36, 0, 0 + ))), + ); + assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 0))) + != IntervalDayTime(Some(IntervalDayTimeType::make_value(0, 1))) + ); + assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(1, 4))) + == IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 16))), + ); + assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 3))) + > IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 2, + 28, + 999_999_999 + ))), + ); + assert!( + IntervalYearMonth(Some(IntervalYearMonthType::make_value(0, 1))) + > IntervalDayTime(Some(IntervalDayTimeType::make_value(29, 9_999))), + ); + assert!( + IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value(1, 12, 34))) + > IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 0, 142, 34 + ))) + ); } #[test] @@ -4663,6 +4923,224 @@ mod tests { } } + #[test] + fn test_scalar_interval_add() { + let cases = [ + ( + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 1, 12, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 1, 12, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 2, 24, + ))), + ), + ( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 1, 999, + ))), + 
ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 1, 999, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 2, 1998, + ))), + ), + ( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 123_456), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 123_456), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(24, 30, 246_912), + )), + ), + ( + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, 1, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 29, 86_390, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(1, 29, 86_390_000_000), + )), + ), + ( + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, 1, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(2, 10, 999_999_999), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(3, 10, 999_999_999), + )), + ), + ( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 400, 123_456, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 1, 1, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(13, 400, 123_456_000_000), + )), + ), + ( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 65, 321, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(2, 5, 1_000_000), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(2, 70, 322_000_000), + )), + ), + ( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 123_456), + )), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 2, 0, + ))), + ScalarValue::IntervalMonthDayNano(Some( + 
IntervalMonthDayNanoType::make_value(36, 15, 123_456), + )), + ), + ( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 100_000), + )), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 370, 1, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 385, 1_100_000), + )), + ), + ]; + for (lhs, rhs, expected) in cases.iter() { + let result = lhs.add(rhs).unwrap(); + let result_commute = rhs.add(lhs).unwrap(); + assert_eq!(*expected, result, "lhs:{:?} + rhs:{:?}", lhs, rhs); + assert_eq!(*expected, result_commute, "lhs:{:?} + rhs:{:?}", rhs, lhs); + } + } + + #[test] + fn test_scalar_interval_sub() { + let cases = [ + ( + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 1, 12, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 1, 12, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, 0, + ))), + ), + ( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 1, 999, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 1, 999, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value(0, 0))), + ), + ( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 123_456), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 123_456), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, 0, 0), + )), + ), + ( + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, 1, + ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 29, 999_999, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(1, -29, -999_999_000_000), + )), + ), + ( + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 0, 1, + ))), + 
ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(2, 10, 999_999_999), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(-1, -10, -999_999_999), + )), + ), + ( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 400, 123_456, + ))), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 1, 1, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(-13, 400, 123_456_000_000), + )), + ), + ( + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 65, 321, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(2, 5, 1_000_000), + )), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(-2, 60, 320_000_000), + )), + ), + ( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 123_456), + )), + ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( + 2, 0, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(-12, 15, 123_456), + )), + ), + ( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, 15, 100_000), + )), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( + 370, 1, + ))), + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(12, -355, -900_000), + )), + ), + ]; + for (lhs, rhs, expected) in cases.iter() { + let result = lhs.sub(rhs).unwrap(); + assert_eq!(*expected, result, "lhs:{:?} - rhs:{:?}", lhs, rhs); + } + } + #[test] fn timestamp_op_tests() { // positive interval, edge cases @@ -5030,9 +5508,7 @@ mod tests { let mut intervals = vec![]; let mut rng = rand::thread_rng(); const SECS_IN_ONE_DAY: i32 = 86_400; - const MILLISECS_IN_ONE_DAY: i32 = 86_400_000; const MICROSECS_IN_ONE_DAY: i64 = 86_400_000_000; - const NANOSECS_IN_ONE_DAY: i64 = 86_400_000_000_000; for i in 0..vector_size { if i 
% 4 == 0 { let days = rng.gen_range(0..5000); @@ -5043,7 +5519,7 @@ mod tests { ))) } else if i % 4 == 1 { let days = rng.gen_range(0..5000); - let millisec = rng.gen_range(0..MILLISECS_IN_ONE_DAY); + let millisec = rng.gen_range(0..(MILLISECS_IN_ONE_DAY as i32)); intervals.push(ScalarValue::IntervalDayTime(Some( IntervalDayTimeType::make_value(days, millisec), ))) From 3bf8fd6e1b9b0542bfc5a886acc13a6410b26360 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 14 Mar 2023 11:26:37 +0300 Subject: [PATCH 18/55] naming tests --- datafusion/common/src/scalar.rs | 93 +++++++++++++++++---------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 3d7ff1314f27..a9e25f1883d3 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -856,7 +856,7 @@ enum IntervalMode { /// - When subtracting timestamps at seconds/milliseconds precision, the output /// interval will have the type [`IntervalDayTimeType`]. /// - When subtracting timestamps at microseconds/nanoseconds precision, the -/// output interval will have the type [`IntervalMonthDayNano`]. +/// output interval will have the type [`IntervalMonthDayNanoType`]. 
fn ts_sub_to_interval( lhs_ts: i64, rhs_ts: i64, @@ -5195,16 +5195,18 @@ mod tests { ) -> Vec<(ScalarValue, ScalarValue, ScalarValue)> { vec![ ( - // 1st test case + // 1st test case, having the same time but with different timezones. + // Since they are timestamps with nanosecond precision, the expected type is + // [`IntervalMonthDayNanoType`]. ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) .unwrap() - .and_hms_nano_opt(1, 0, 0, 000_000_000) + .and_hms_nano_opt(12, 0, 0, 000_000_000) .unwrap() .timestamp_nanos(), ), - Some("+01:00".to_string()), + Some("+12:00".to_string()), ), ScalarValue::TimestampNanosecond( Some( @@ -5220,7 +5222,7 @@ mod tests { IntervalMonthDayNanoType::make_value(0, 0, 0), )), ), - // 2nd test case + // 2nd test case, January with 31 days plus February with 28 days, with timezone ( ScalarValue::TimestampMicrosecond( Some( @@ -5246,11 +5248,11 @@ mod tests { IntervalMonthDayNanoType::make_value(0, sign * 59, 0), )), ), - // 3rd test case + // 3rd test case, 29-day-long February minus the previous year, with timezone ( ScalarValue::TimestampMillisecond( Some( - NaiveDate::from_ymd_opt(2023, 2, 11) + NaiveDate::from_ymd_opt(2024, 2, 29) .unwrap() .and_hms_milli_opt(10, 10, 0, 000) .unwrap() @@ -5260,7 +5262,7 @@ mod tests { ), ScalarValue::TimestampMillisecond( Some( - NaiveDate::from_ymd_opt(2023, 1, 1) + NaiveDate::from_ymd_opt(2023, 12, 31) .unwrap() .and_hms_milli_opt(1, 0, 0, 000) .unwrap() @@ -5269,15 +5271,16 @@ mod tests { Some("+01:00".to_string()), ), ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 41, + sign * 60, 0, ))), ), - // 4th test case + // 4th test case, leap years occur mostly every 4 years, but every 100 years + // we skip a leap year unless the year is divisible by 400, so 31 + 28 = 59 ( ScalarValue::TimestampSecond( Some( - NaiveDate::from_ymd_opt(2023, 3, 1) + NaiveDate::from_ymd_opt(2100, 3, 1) .unwrap() .and_hms_opt(0, 0, 0) .unwrap() @@ -5287,7 +5290,7 @@ mod tests {
), ScalarValue::TimestampSecond( Some( - NaiveDate::from_ymd_opt(2023, 1, 1) + NaiveDate::from_ymd_opt(2100, 1, 1) .unwrap() .and_hms_opt(23, 58, 0) .unwrap() @@ -5300,17 +5303,18 @@ mod tests { 0, ))), ), - // 5th test case + // 5th test case, the difference looks positive without timezones, but with + // timezones the resulting interval is negative ( ScalarValue::TimestampMillisecond( Some( - NaiveDate::from_ymd_opt(2023, 3, 1) + NaiveDate::from_ymd_opt(2023, 1, 1) .unwrap() - .and_hms_milli_opt(23, 58, 0, 250) + .and_hms_milli_opt(6, 00, 0, 000) .unwrap() .timestamp_millis(), ), - Some("+11:59".to_string()), + Some("+06:00".to_string()), ), ScalarValue::TimestampMillisecond( Some( @@ -5320,20 +5324,20 @@ mod tests { .unwrap() .timestamp_millis(), ), - Some("-11:59".to_string()), + Some("-12:00".to_string()), ), ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 59, - sign * 250, + 0, + sign * -43_200_000, ))), ), - // 6th test case + // 6th test case, no problem with timestamps before the Unix epoch ( ScalarValue::TimestampMicrosecond( Some( - NaiveDate::from_ymd_opt(2023, 3, 1) + NaiveDate::from_ymd_opt(1970, 1, 1) .unwrap() - .and_hms_micro_opt(0, 0, 0, 15) + .and_hms_micro_opt(1, 2, 3, 15) .unwrap() .timestamp_micros(), ), @@ -5341,7 +5345,7 @@ mod tests { ), ScalarValue::TimestampMicrosecond( Some( - NaiveDate::from_ymd_opt(2023, 1, 1) + NaiveDate::from_ymd_opt(1969, 1, 1) .unwrap() .and_hms_micro_opt(0, 0, 0, 000_000) .unwrap() @@ -5352,18 +5356,18 @@ mod tests { ScalarValue::IntervalMonthDayNano(Some( IntervalMonthDayNanoType::make_value( 0, - sign * 59, - sign as i64 * 15_000, + 365 * sign, + sign as i64 * 3_723_000_015_000, ), )), ), - // 7th test case + // 7th test case, no problem with big intervals ( ScalarValue::TimestampNanosecond( Some( - NaiveDate::from_ymd_opt(2023, 3, 1) + NaiveDate::from_ymd_opt(2100, 1, 1) .unwrap() - .and_hms_nano_opt(0, 0, 0, 22) + .and_hms_nano_opt(0, 0, 0, 0) .unwrap() .timestamp_nanos(), ), @@ -5371,7 +5375,7 @@ mod tests { ),
ScalarValue::TimestampNanosecond( Some( - NaiveDate::from_ymd_opt(2023, 1, 31) + NaiveDate::from_ymd_opt(2000, 1, 1) .unwrap() .and_hms_nano_opt(0, 0, 0, 000_000_000) .unwrap() @@ -5380,14 +5384,14 @@ mod tests { None, ), ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNanoType::make_value(0, sign * 29, sign as i64 * 22), + IntervalMonthDayNanoType::make_value(0, sign * 36525, 0), )), ), - // 8th test case + // 8th test case, no problem detecting 366-days long years ( ScalarValue::TimestampSecond( Some( - NaiveDate::from_ymd_opt(2023, 3, 1) + NaiveDate::from_ymd_opt(2041, 1, 1) .unwrap() .and_hms_opt(0, 0, 0) .unwrap() @@ -5397,45 +5401,42 @@ mod tests { ), ScalarValue::TimestampSecond( Some( - NaiveDate::from_ymd_opt(2021, 12, 30) + NaiveDate::from_ymd_opt(2040, 1, 1) .unwrap() - .and_hms_opt(0, 0, 30) + .and_hms_opt(0, 0, 0) .unwrap() .timestamp(), ), None, ), ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 425, - sign * 86370000, + sign * 366, + 0, ))), ), - // 9th test case + // 9th test case, no problem with unrealistic timezones ( ScalarValue::TimestampSecond( Some( - NaiveDate::from_ymd_opt(2023, 12, 1) + NaiveDate::from_ymd_opt(2023, 1, 3) .unwrap() .and_hms_opt(0, 0, 0) .unwrap() .timestamp(), ), - None, + Some("+23:59".to_string()), ), ScalarValue::TimestampSecond( Some( - NaiveDate::from_ymd_opt(1980, 11, 1) + NaiveDate::from_ymd_opt(2023, 1, 1) .unwrap() - .and_hms_opt(0, 0, 0) + .and_hms_opt(0, 2, 0) .unwrap() .timestamp(), ), - None, + Some("-23:59".to_string()), ), - ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value( - sign * 15735, - 0, - ))), + ScalarValue::IntervalDayTime(Some(IntervalDayTimeType::make_value(0, 0))), ), ] } From 0f8a7a74536d08404a0a66d4c0f987232762ed53 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 14 Mar 2023 18:48:57 +0300 Subject: [PATCH 19/55] macro renaming --- datafusion/common/src/scalar.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index a9e25f1883d3..0570cbe851a5 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -553,7 +553,7 @@ macro_rules! unsigned_subtraction_error { macro_rules! impl_op { ($LHS:expr, $RHS:expr, +) => { - impl_op_symmetric!($LHS, $RHS, +) + impl_op_dissociated!($LHS, $RHS, +) }; ($LHS:expr, $RHS:expr, -) => { match ($LHS, $RHS) { @@ -611,12 +611,12 @@ macro_rules! impl_op { tz_rhs, IntervalMode::Nano, ), - _ => impl_op_symmetric!($LHS, $RHS, -) + _ => impl_op_dissociated!($LHS, $RHS, -) } }; } -macro_rules! impl_op_symmetric { +macro_rules! impl_op_dissociated { ($LHS:expr, $RHS:expr, $OPERATION:tt) => { match ($LHS, $RHS) { // Binary operations on arguments with the same type: From cf892fefcc6bbe6cc304a378460f7c22a668b0b4 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 14 Mar 2023 19:07:54 +0300 Subject: [PATCH 20/55] renaming macro --- datafusion/common/src/scalar.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 0570cbe851a5..36138c8130d6 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -553,7 +553,7 @@ macro_rules! unsigned_subtraction_error { macro_rules! impl_op { ($LHS:expr, $RHS:expr, +) => { - impl_op_dissociated!($LHS, $RHS, +) + impl_op_arithmetic!($LHS, $RHS, +) }; ($LHS:expr, $RHS:expr, -) => { match ($LHS, $RHS) { @@ -611,12 +611,12 @@ macro_rules! impl_op { tz_rhs, IntervalMode::Nano, ), - _ => impl_op_dissociated!($LHS, $RHS, -) + _ => impl_op_arithmetic!($LHS, $RHS, -) } }; } -macro_rules! impl_op_dissociated { +macro_rules! 
impl_op_arithmetic { ($LHS:expr, $RHS:expr, $OPERATION:tt) => { match ($LHS, $RHS) { // Binary operations on arguments with the same type: From a078dbbc5f840fd4b58ed6c07b5fa2672054e2b1 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 20 Mar 2023 15:49:33 +0300 Subject: [PATCH 21/55] ok till arrow kernel ops --- datafusion/core/tests/sql/mod.rs | 59 ++++++ datafusion/core/tests/sql/timestamp.rs | 22 +++ datafusion/expr/src/type_coercion/binary.rs | 55 +++++- .../physical-expr/src/expressions/datetime.rs | 168 +++++++++++++----- datafusion/physical-expr/src/planner.rs | 21 +++ 5 files changed, 278 insertions(+), 47 deletions(-) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 8b810737cfd5..1a6959846f1c 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -1283,6 +1283,65 @@ where Ok(Arc::new(table)) } +fn make_timestamp_sub_table() -> Result> +where + A: ArrowTimestampType, +{ + make_timestamp_tz_sub_table::(None) +} + +fn make_timestamp_tz_sub_table(tz: Option) -> Result> +where + A: ArrowTimestampType, +{ + let schema = Arc::new(Schema::new(vec![ + Field::new( + "ts1", + DataType::Timestamp(A::get_time_unit(), tz.clone()), + false, + ), + Field::new( + "ts2", + DataType::Timestamp(A::get_time_unit(), tz.clone()), + false, + ), + Field::new("val", DataType::Int32, true), + ])); + + let divisor = match A::get_time_unit() { + TimeUnit::Nanosecond => 1, + TimeUnit::Microsecond => 1000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Second => 1_000_000_000, + }; + + let timestamps1 = vec![ + 1678892420_000_000_000i64 / divisor, + 1678892410_000_000_000i64 / divisor, + 1678892430_000_000_000i64 / divisor, + ]; + let timestamps2 = vec![ + 1678892400_000_000_000i64 / divisor, + 1678892400_000_000_000i64 / divisor, + 1678892400_000_000_000i64 / divisor, + ]; + + let array1 = + PrimitiveArray::::from_iter_values(timestamps1).with_timezone_opt(tz.clone()); + let array2 = 
PrimitiveArray::::from_iter_values(timestamps2).with_timezone_opt(tz); + + let data = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(array1), + Arc::new(array2), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + ], + )?; + let table = MemTable::try_new(schema, vec![vec![data]])?; + Ok(Arc::new(table)) +} + fn make_timestamp_nano_table() -> Result> { make_timestamp_table::() } diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index 128ee1639e3f..7d6bf99b2696 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -1691,3 +1691,25 @@ async fn test_ts_dt_binary_ops() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn timestamp_sub() -> Result<()> { + let ctx = SessionContext::new(); + let table_a = make_timestamp_sub_table::()?; + ctx.register_table("table_a", table_a)?; + + let sql = "SELECT val, ts1 - ts2 AS ts_diff FROM table_a ORDER BY ts2 - ts1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+--------------------------------------------------+", + "| val | ts_diff |", + "+-----+--------------------------------------------------+", + "| 3 | 0 years 0 mons 0 days 0 hours 0 mins 30.000 secs |", + "| 1 | 0 years 0 mons 0 days 0 hours 0 mins 20.000 secs |", + "| 2 | 0 years 0 mons 0 days 0 hours 0 mins 10.000 secs |", + "+-----+--------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + return Ok(()); +} diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 5ee66837ec16..d58c6069a527 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -21,7 +21,7 @@ use crate::type_coercion::{is_date, is_interval, is_numeric, is_timestamp}; use crate::Operator; use arrow::compute::can_cast_types; use arrow::datatypes::{ - DataType, TimeUnit, DECIMAL128_MAX_PRECISION, 
DECIMAL128_MAX_SCALE, + DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, }; use datafusion_common::DataFusionError; use datafusion_common::Result; @@ -220,7 +220,58 @@ pub fn temporal_add_sub_coercion( return Ok(Some(lhs_type.clone())); } - // date or timestamp + date or timestamp + // timestamp - timestamp + if is_timestamp(lhs_type) && is_timestamp(rhs_type) && (*op == Operator::Minus) { + // At this stage, a timestamp can be subtracted from a timestamp only if they + // have the same type. To not lose data, second and millisecond precision + // timestamps give output of type `IntervalDayTime`, and microsecond and + // nanosecond precision timestamps give output of type `IntervalMonthDayNano`. + // A nanosecond precision subtraction may result in `IntervalYearMonth` or + // `IntervalDayTime` without loss of data; however, we need to be deterministic + // while determining the type of the output. + match (lhs_type, rhs_type) { + ( + DataType::Timestamp(TimeUnit::Second, _), + DataType::Timestamp(TimeUnit::Second, _), + ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), + ( + DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Timestamp(TimeUnit::Millisecond, _), + ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Timestamp(TimeUnit::Microsecond, _), + ) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), + ( + DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Timestamp(TimeUnit::Nanosecond, _), + ) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), + (_, _) => { + return Err(DataFusionError::Plan(format!( + "The timestamps have different types" + ))); + } + } + } + + // interval + interval + if is_interval(lhs_type) && is_interval(rhs_type) { + match (lhs_type, rhs_type) { + // operation with the same types + ( + DataType::Interval(IntervalUnit::YearMonth), +
DataType::Interval(IntervalUnit::YearMonth), + ) => return Ok(Some(DataType::Interval(IntervalUnit::YearMonth))), + ( + DataType::Interval(IntervalUnit::DayTime), + DataType::Interval(IntervalUnit::DayTime), + ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), + // operation with MonthDayNano's or different types + (_, _) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), + } + } + + // date + date or timestamp + timestamp with + operator if (is_date(lhs_type) || is_timestamp(lhs_type)) && (is_date(rhs_type) || is_timestamp(rhs_type)) { diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 655cb07f03b1..41fd230ebcb8 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -18,12 +18,14 @@ use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; use arrow::array::{Array, ArrayRef}; -use arrow::compute::unary; +use arrow::compute::{binary, unary}; use arrow::datatypes::{ - DataType, Date32Type, Date64Type, Schema, TimeUnit, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, + DataType, Date32Type, Date64Type, IntervalDayTimeType, Schema, TimeUnit, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, }; use arrow::record_batch::RecordBatch; +use arrow_schema::IntervalUnit; use datafusion_common::cast::{ as_date32_array, as_date64_array, as_timestamp_microsecond_array, as_timestamp_millisecond_array, as_timestamp_nanosecond_array, @@ -31,10 +33,11 @@ use datafusion_common::cast::{ }; use datafusion_common::scalar::{ date32_add, date64_add, microseconds_add, milliseconds_add, nanoseconds_add, - seconds_add, + seconds_add, trial, ts_sub_to_interval, IntervalMode, }; use datafusion_common::Result; use datafusion_common::{DataFusionError, ScalarValue}; +use 
datafusion_expr::type_coercion::is_interval; use datafusion_expr::{ColumnarValue, Operator}; use std::any::Any; use std::fmt::{Display, Formatter}; @@ -59,27 +62,29 @@ impl DateTimeIntervalExpr { rhs: Arc, input_schema: &Schema, ) -> Result { - match lhs.data_type(input_schema)? { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => { - match rhs.data_type(input_schema)? { - DataType::Interval(_) => match &op { - Operator::Plus | Operator::Minus => Ok(Self { - lhs, - op, - rhs, - input_schema: input_schema.clone(), - }), - _ => Err(DataFusionError::Execution(format!( - "Invalid operator '{op}' for DateIntervalExpr" - ))), - }, - other => Err(DataFusionError::Execution(format!( - "Operation '{op}' not support for type {other}" - ))), - } - } + match ( + lhs.data_type(input_schema)?, + op, + rhs.data_type(input_schema)?, + ) { + ( + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _), + Operator::Plus | Operator::Minus, + DataType::Interval(_), + ) + | (DataType::Timestamp(_, _), Operator::Minus, DataType::Timestamp(_, _)) + | ( + DataType::Interval(_), + Operator::Plus | Operator::Minus, + DataType::Interval(_), + ) => Ok(Self { + lhs, + op, + rhs, + input_schema: input_schema.clone(), + }), other => Err(DataFusionError::Execution(format!( - "Invalid lhs type '{other}' for DateIntervalExpr" + "Invalid operation '{other:?}' for DateIntervalExpr" ))), } } @@ -112,7 +117,35 @@ impl PhysicalExpr for DateTimeIntervalExpr { } fn data_type(&self, input_schema: &Schema) -> Result { - self.lhs.data_type(input_schema) + let lhs_data_type = self.lhs.data_type(input_schema)?; + let rhs_data_type = self.rhs.data_type(input_schema)?; + + if is_interval(&lhs_data_type) && is_interval(&rhs_data_type) { + if lhs_data_type == rhs_data_type { + return Ok(lhs_data_type); + } else { + return Ok(DataType::Interval(IntervalUnit::MonthDayNano)); + } + } + match (lhs_data_type, rhs_data_type) { + ( + DataType::Timestamp(TimeUnit::Second, _), + 
DataType::Timestamp(TimeUnit::Second, _), + ) + | ( + DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Timestamp(TimeUnit::Millisecond, _), + ) => Ok(DataType::Interval(IntervalUnit::DayTime)), + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Timestamp(TimeUnit::Microsecond, _), + ) + | ( + DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Timestamp(TimeUnit::Nanosecond, _), + ) => Ok(DataType::Interval(IntervalUnit::MonthDayNano)), + (_, _) => self.lhs.data_type(input_schema), + } } fn nullable(&self, input_schema: &Schema) -> Result { @@ -120,18 +153,8 @@ impl PhysicalExpr for DateTimeIntervalExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - let dates = self.lhs.evaluate(batch)?; - let intervals = self.rhs.evaluate(batch)?; - - // Unwrap interval to add - let intervals = match &intervals { - ColumnarValue::Scalar(interval) => interval, - _ => { - let msg = "Columnar execution is not yet supported for DateIntervalExpr"; - return Err(DataFusionError::Execution(msg.to_string())); - } - }; - + let lhs_columnar = self.lhs.evaluate(batch)?; + let rhs_columnar = self.rhs.evaluate(batch)?; // Invert sign for subtraction let sign = match self.op { Operator::Plus => 1, @@ -142,14 +165,34 @@ impl PhysicalExpr for DateTimeIntervalExpr { return Err(DataFusionError::Internal(msg.to_string())); } }; - - match dates { - ColumnarValue::Scalar(operand) => Ok(ColumnarValue::Scalar(if sign > 0 { - operand.add(intervals)? - } else { - operand.sub(intervals)? - })), - ColumnarValue::Array(array) => evaluate_array(array, sign, intervals), + // RHS is first checked. If it is a Scalar, there are 2 options: + // Either LHS is also a Scalar and matching operation is applied, + // or LHS is an Array and unary operations for related types are + // applied in evaluate_array function. If RHS is an Array, then + // LHS must also be, moreover; they must be the same Timestamp type. 
+ match &rhs_columnar { + ColumnarValue::Scalar(operand_rhs) => match lhs_columnar { + ColumnarValue::Scalar(operand_lhs) => { + Ok(ColumnarValue::Scalar(if sign > 0 { + operand_lhs.add(operand_rhs)? + } else { + operand_lhs.sub(operand_rhs)? + })) + } + ColumnarValue::Array(array_lhs) => { + evaluate_array(array_lhs, sign, operand_rhs) + } + }, + ColumnarValue::Array(array_rhs) => match lhs_columnar { + ColumnarValue::Array(array_lhs) => { + evaluate_arrays(array_lhs, sign, array_rhs) + } + _ => { + let msg = + "If RHS of the operation is an array, then LHS also must be"; + Err(DataFusionError::Internal(msg.to_string())) + } + }, } } @@ -239,6 +282,41 @@ pub fn evaluate_array( Ok(ColumnarValue::Array(ret)) } +pub fn evaluate_arrays( + array_lhs: ArrayRef, + sign: i32, + array_rhs: &ArrayRef, +) -> Result { + let err = + || DataFusionError::Execution("Overflow while evaluating arrays".to_string()); + let ret = match (array_lhs.data_type(), array_rhs.data_type()) { + ( + DataType::Timestamp(TimeUnit::Second, opt_tz_lhs), + DataType::Timestamp(TimeUnit::Second, opt_tz_rhs), + ) => { + let prim_array_lhs = as_timestamp_second_array(&array_lhs)?; + let prim_array_rhs = as_timestamp_second_array(&array_rhs)?; + Arc::new( + binary::( + prim_array_lhs, + prim_array_rhs, + |ts1: TimestampSecondType, ts2: TimestampSecondType| { + trial(ts1, ts2) + }, + ) + .unwrap(), + ) + } + (_, _) => Err(DataFusionError::Execution(format!( + "Invalid array types for DateIntervalExpr: {:?} {} {:?}", + array_lhs.data_type(), + sign, + array_rhs.data_type() + )))?, + } as ArrayRef; + Ok(ColumnarValue::Array(ret)) +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 1fbd73b3ba01..f4f7d6a40239 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -207,6 +207,27 @@ pub fn create_physical_expr( lhs, input_schema, )?)), + // Timestamp + Timestamp operations 
cannot reach till that point already. + ( + DataType::Timestamp(_, _), + Operator::Minus, + DataType::Timestamp(_, _), + ) => Ok(Arc::new(DateTimeIntervalExpr::try_new( + lhs, + *op, + rhs, + input_schema, + )?)), + ( + DataType::Interval(_), + Operator::Plus | Operator::Minus, + DataType::Interval(_), + ) => Ok(Arc::new(DateTimeIntervalExpr::try_new( + lhs, + *op, + rhs, + input_schema, + )?)), _ => { // Note that the logical planner is responsible // for type coercion on the arguments (e.g. if one From bbfd9b19639a1bc1356761394d3a93dcf879a575 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 22 Mar 2023 16:59:03 +0300 Subject: [PATCH 22/55] macro will replace matches inside evaluate add tests macro will replace matches inside evaluate ready for review --- datafusion-cli/Cargo.lock | 132 ++-- datafusion/common/src/cast.rs | 24 +- datafusion/common/src/scalar.rs | 266 ++++++-- datafusion/core/tests/sql/mod.rs | 213 ++++++- datafusion/core/tests/sql/timestamp.rs | 212 ++++++- datafusion/expr/src/type_coercion/binary.rs | 32 +- .../physical-expr/src/expressions/datetime.rs | 578 +++++++++++++++--- 7 files changed, 1234 insertions(+), 223 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9bf439ce87cf..88fa0137138e 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -56,9 +56,9 @@ dependencies = [ [[package]] name = "arrayref" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" @@ -338,13 +338,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.66" +version = "0.1.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84f9ebcc6c1f5b8cb160f6990096a5c127f423fcb6e1ccc46c370cbdfb75dfc" +checksum = 
"86ea188f25f0255d8f92797797c97ebf5631fa88178beb1a46fdf5622c9a00e4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.8", ] [[package]] @@ -545,7 +545,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -680,9 +680,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a140f260e6f3f79013b8bfc65e7ce630c9ab4388c6a89c71e07226f49487b72" +checksum = "a9c00419335c41018365ddf7e4d5f1c12ee3659ddcf3e01974650ba1de73d038" dependencies = [ "cc", "cxxbridge-flags", @@ -692,9 +692,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da6383f459341ea689374bf0a42979739dc421874f112ff26f829b8040b8e613" +checksum = "fb8307ad413a98fff033c8545ecf133e3257747b3bae935e7602aab8aa92d4ca" dependencies = [ "cc", "codespan-reporting", @@ -702,24 +702,24 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 2.0.8", ] [[package]] name = "cxxbridge-flags" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90201c1a650e95ccff1c8c0bb5a343213bdd317c6e600a93075bca2eff54ec97" +checksum = "edc52e2eb08915cb12596d29d55f0b5384f00d697a646dbd269b6ecb0fbd9d31" [[package]] name = "cxxbridge-macro" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b75aed41bb2e6367cae39e6326ef817a851db13c13e4f3263714ca3cfb8de56" +checksum = "631569015d0d8d54e6c241733f944042623ab6df7bc3be7466874b05fcdb1c5f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.8", ] [[package]] @@ -1144,7 +1144,7 @@ checksum = "3eb14ed937631bd8b8b8977f2c198443447a8355b6e3ca599f38c975e5a963b6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1357,16 +1357,16 @@ 
dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "0c17cc76786e99f8d2f055c11159e7f0091c42474dcc3189fbab96072e873e6d" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "winapi", + "windows", ] [[package]] @@ -1416,9 +1416,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "io-lifetimes" -version = "1.0.7" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76e86b86ae312accbf05ade23ce76b625e0e47a255712b7414037385a1c05380" +checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" dependencies = [ "hermit-abi 0.3.1", "libc", @@ -1647,9 +1647,9 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" @@ -1825,9 +1825,9 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.4.1" +version = "6.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" [[package]] name = "parking_lot" @@ -1987,7 +1987,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "version_check", ] @@ -2010,9 +2010,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.52" +version = "1.0.53" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d0e1ae9e836cc3beddd63db0df682593d7e2d3d891ae8c9083d2113e1744224" +checksum = "ba466839c78239c09faf015484e5cc04860f88242cff4d03eb038f04b4699b73" dependencies = [ "unicode-ident", ] @@ -2098,9 +2098,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "cce168fea28d3e05f158bda4576cf0c844d5045bc2cc3620fa0292ed5bb5814c" dependencies = [ "aho-corasick", "memchr", @@ -2109,15 +2109,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "reqwest" -version = "0.11.14" +version = "0.11.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9" +checksum = "0ba30cc2c0cd02af1222ed216ba659cdb2f879dfe3181852fe7c50b1d0005949" dependencies = [ "base64", "bytes", @@ -2180,9 +2180,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.9" +version = "0.36.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc" +checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e" dependencies = [ "bitflags", "errno", @@ -2293,22 +2293,22 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.154" +version = "1.0.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cdd151213925e7f1ab45a9bbfb129316bd00799784b174b7cc7bcd16961c49e" +checksum = 
"771d4d9c4163ee138805e12c710dd365e4f44be8be0503cb1bb9eb989425d9c9" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.154" +version = "1.0.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fc80d722935453bcafdc2c9a73cd6fac4dc1938f0346035d84bf99fa9e33217" +checksum = "e801c1712f48475582b7696ac71e0ca34ebb30e09338425384269d9717c62cad" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.8", ] [[package]] @@ -2385,7 +2385,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2428,7 +2428,7 @@ checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2465,7 +2465,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 1.0.109", ] [[package]] @@ -2485,6 +2485,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn" +version = "2.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc02725fd69ab9f26eab07fad303e2497fad6fb9eba4f96c4d1687bdf704ad9" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "tempfile" version = "3.4.0" @@ -2515,22 +2526,22 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.39" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.39" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" 
dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.8", ] [[package]] @@ -2595,7 +2606,7 @@ checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2660,7 +2671,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2696,9 +2707,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-bidi" -version = "0.3.11" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524b68aca1d05e03fdf03fcdce2c6c94b6daf6d16861ddaa7e4f2b6638a9052c" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" @@ -2812,7 +2823,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-shared", ] @@ -2846,7 +2857,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2930,6 +2941,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdacb41e6a96a052c6cb63a144f24900236121c6f63f4f8219fef5977ecb0c25" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.42.0" diff --git a/datafusion/common/src/cast.rs b/datafusion/common/src/cast.rs index fe909e775815..b9cc429d7023 100644 --- a/datafusion/common/src/cast.rs +++ b/datafusion/common/src/cast.rs @@ -26,7 +26,8 @@ use arrow::{ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, 
DictionaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, Float64Array, GenericBinaryArray, GenericListArray, GenericStringArray, - Int32Array, Int64Array, LargeListArray, ListArray, MapArray, NullArray, + Int32Array, Int64Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, + IntervalYearMonthArray, LargeListArray, ListArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt32Array, UInt64Array, UnionArray, @@ -168,6 +169,27 @@ pub fn as_timestamp_second_array(array: &dyn Array) -> Result<&TimestampSecondAr Ok(downcast_value!(array, TimestampSecondArray)) } +// Downcast ArrayRef to IntervalYearMonthArray +pub fn as_interval_ym_array( + array: &dyn Array, +) -> Result<&IntervalYearMonthArray, DataFusionError> { + Ok(downcast_value!(array, IntervalYearMonthArray)) +} + +// Downcast ArrayRef to IntervalDayTimeArray +pub fn as_interval_dt_array( + array: &dyn Array, +) -> Result<&IntervalDayTimeArray, DataFusionError> { + Ok(downcast_value!(array, IntervalDayTimeArray)) +} + +// Downcast ArrayRef to IntervalMonthDayNanoArray +pub fn as_interval_mdn_array( + array: &dyn Array, +) -> Result<&IntervalMonthDayNanoArray, DataFusionError> { + Ok(downcast_value!(array, IntervalMonthDayNanoArray)) +} + // Downcast ArrayRef to BinaryArray pub fn as_binary_array(array: &dyn Array) -> Result<&BinaryArray> { Ok(downcast_value!(array, BinaryArray)) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index cb5bb12d1a8c..7b845257efb1 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -43,12 +43,15 @@ use arrow::{ DECIMAL128_MAX_PRECISION, }, }; -use chrono::{DateTime, Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime}; +use arrow_array::timezone::Tz; +use chrono::{DateTime, Datelike, Duration, NaiveDate, NaiveDateTime, TimeZone}; // Constants we use throughout 
this file: const MILLISECS_IN_ONE_DAY: i64 = 86_400_000; const NANOSECS_IN_ONE_DAY: i64 = 86_400_000_000_000; +const SECS_IN_ONE_MONTH: i64 = 2_592_000; // assuming 30 days. const MILLISECS_IN_ONE_MONTH: i64 = 2_592_000_000; // assuming 30 days. +const MICROSECS_IN_ONE_MONTH: i64 = 2_592_000_000_000; // assuming 30 days. const NANOSECS_IN_ONE_MONTH: i128 = 2_592_000_000_000_000; // assuming 30 days. /// Represents a dynamically typed, nullable single value. @@ -377,41 +380,111 @@ impl PartialOrd for ScalarValue { /// This function computes the duration (in milliseconds) of the given /// year-month-interval. #[inline] -fn ym_to_milli(val: &Option) -> Option { +pub fn ym_to_sec(val: &Option) -> Option { + val.map(|value| (value as i64) * SECS_IN_ONE_MONTH) +} + +/// This function computes the duration (in milliseconds) of the given +/// year-month-interval. +#[inline] +pub fn ym_to_milli(val: &Option) -> Option { val.map(|value| (value as i64) * MILLISECS_IN_ONE_MONTH) } +/// This function computes the duration (in milliseconds) of the given +/// year-month-interval. +#[inline] +pub fn ym_to_micro(val: &Option) -> Option { + val.map(|value| (value as i64) * MICROSECS_IN_ONE_MONTH) +} + /// This function computes the duration (in nanoseconds) of the given /// year-month-interval. #[inline] -fn ym_to_nano(val: &Option) -> Option { +pub fn ym_to_nano(val: &Option) -> Option { val.map(|value| (value as i128) * NANOSECS_IN_ONE_MONTH) } +/// This function computes the duration (in seconds) of the given +/// daytime-interval. +#[inline] +pub fn dt_to_sec(val: &Option) -> Option { + val.map(|val| { + let (days, millis) = IntervalDayTimeType::to_parts(val); + (days as i64) * MILLISECS_IN_ONE_DAY + (millis as i64 / 1_000) + }) +} + /// This function computes the duration (in milliseconds) of the given /// daytime-interval. 
#[inline] -fn dt_to_milli(val: &Option) -> Option { +pub fn dt_to_milli(val: &Option) -> Option { val.map(|val| { let (days, millis) = IntervalDayTimeType::to_parts(val); (days as i64) * MILLISECS_IN_ONE_DAY + (millis as i64) }) } +/// This function computes the duration (in microseconds) of the given +/// daytime-interval. +#[inline] +pub fn dt_to_micro(val: &Option) -> Option { + val.map(|val| { + let (days, millis) = IntervalDayTimeType::to_parts(val); + (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + (millis as i128) * 1_000 + }) +} + /// This function computes the duration (in nanoseconds) of the given /// daytime-interval. #[inline] -fn dt_to_nano(val: &Option) -> Option { +pub fn dt_to_nano(val: &Option) -> Option { val.map(|val| { let (days, millis) = IntervalDayTimeType::to_parts(val); (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + (millis as i128) * 1_000_000 }) } +/// This function computes the duration (in seconds) of the given +/// month-day-nano-interval. Assumes a month is 30 days long. +#[inline] +pub fn mdn_to_sec(val: &Option) -> Option { + val.map(|val| { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(val); + (months as i128) * NANOSECS_IN_ONE_MONTH + + (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + + (nanos as i128) / 1_000_000_000 + }) +} + +/// This function computes the duration (in milliseconds) of the given +/// month-day-nano-interval. Assumes a month is 30 days long. +#[inline] +pub fn mdn_to_milli(val: &Option) -> Option { + val.map(|val| { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(val); + (months as i128) * NANOSECS_IN_ONE_MONTH + + (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + + (nanos as i128) / 1_000_000 + }) +} + +/// This function computes the duration (in microseconds) of the given +/// month-day-nano-interval. Assumes a month is 30 days long. 
+#[inline] +pub fn mdn_to_micro(val: &Option) -> Option { + val.map(|val| { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(val); + (months as i128) * NANOSECS_IN_ONE_MONTH + + (days as i128) * (NANOSECS_IN_ONE_DAY as i128) + + (nanos as i128) / 1_000 + }) +} + /// This function computes the duration (in nanoseconds) of the given /// month-day-nano-interval. Assumes a month is 30 days long. #[inline] -fn mdn_to_nano(val: &Option) -> Option { +pub fn mdn_to_nano(val: &Option) -> Option { val.map(|val| { let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(val); (months as i128) * NANOSECS_IN_ONE_MONTH @@ -659,37 +732,27 @@ macro_rules! impl_op_arithmetic { ( ScalarValue::IntervalYearMonth(Some(lhs)), ScalarValue::IntervalYearMonth(Some(rhs)), - ) => Ok(ScalarValue::new_interval_ym( - 0, - lhs + rhs * get_sign!($OPERATION), - )), + ) => Ok(ScalarValue::IntervalYearMonth(Some(op_ym( + *lhs, + *rhs, + get_sign!($OPERATION), + )))), ( ScalarValue::IntervalDayTime(Some(lhs)), ScalarValue::IntervalDayTime(Some(rhs)), - ) => { - let sign = get_sign!($OPERATION); - let (lhs_days, lhs_millis) = IntervalDayTimeType::to_parts(*lhs); - let (rhs_days, rhs_millis) = IntervalDayTimeType::to_parts(*rhs); - Ok(ScalarValue::new_interval_dt( - lhs_days + rhs_days * sign, - lhs_millis + rhs_millis * sign, - )) - } + ) => Ok(ScalarValue::IntervalDayTime(Some(op_dt( + *lhs, + *rhs, + get_sign!($OPERATION), + )))), ( ScalarValue::IntervalMonthDayNano(Some(lhs)), ScalarValue::IntervalMonthDayNano(Some(rhs)), - ) => { - let sign = get_sign!($OPERATION); - let (lhs_months, lhs_days, lhs_nanos) = - IntervalMonthDayNanoType::to_parts(*lhs); - let (rhs_months, rhs_days, rhs_nanos) = - IntervalMonthDayNanoType::to_parts(*rhs); - Ok(ScalarValue::new_interval_mdn( - lhs_months + rhs_months * sign, - lhs_days + rhs_days * sign, - lhs_nanos + rhs_nanos * (sign as i64), - )) - } + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_mdn( + *lhs, + *rhs, + 
get_sign!($OPERATION), + )))), // Binary operations on arguments with different types: (ScalarValue::Date32(Some(days)), _) => { let value = date32_add(*days, $RHS, get_sign!($OPERATION))?; @@ -734,27 +797,57 @@ macro_rules! impl_op_arithmetic { ( ScalarValue::IntervalYearMonth(Some(lhs)), ScalarValue::IntervalDayTime(Some(rhs)), - ) => op_ym_dt(*lhs, *rhs, get_sign!($OPERATION), false), + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_ym_dt( + *lhs, + *rhs, + get_sign!($OPERATION), + false, + )))), ( ScalarValue::IntervalYearMonth(Some(lhs)), ScalarValue::IntervalMonthDayNano(Some(rhs)), - ) => op_ym_mdn(*lhs, *rhs, get_sign!($OPERATION), false), + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_ym_mdn( + *lhs, + *rhs, + get_sign!($OPERATION), + false, + )))), ( ScalarValue::IntervalDayTime(Some(lhs)), ScalarValue::IntervalYearMonth(Some(rhs)), - ) => op_ym_dt(*rhs, *lhs, get_sign!($OPERATION), true), + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_ym_dt( + *rhs, + *lhs, + get_sign!($OPERATION), + true, + )))), ( ScalarValue::IntervalDayTime(Some(lhs)), ScalarValue::IntervalMonthDayNano(Some(rhs)), - ) => op_dt_mdn(*lhs, *rhs, get_sign!($OPERATION), false), + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_dt_mdn( + *lhs, + *rhs, + get_sign!($OPERATION), + false, + )))), ( ScalarValue::IntervalMonthDayNano(Some(lhs)), ScalarValue::IntervalYearMonth(Some(rhs)), - ) => op_ym_mdn(*rhs, *lhs, get_sign!($OPERATION), true), + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_ym_mdn( + *rhs, + *lhs, + get_sign!($OPERATION), + true, + )))), ( ScalarValue::IntervalMonthDayNano(Some(lhs)), ScalarValue::IntervalDayTime(Some(rhs)), - ) => op_dt_mdn(*rhs, *lhs, get_sign!($OPERATION), true), + ) => Ok(ScalarValue::IntervalMonthDayNano(Some(op_dt_mdn( + *rhs, + *lhs, + get_sign!($OPERATION), + true, + )))), _ => Err(DataFusionError::Internal(format!( "Operator {} is not implemented for types {:?} and {:?}", stringify!($OPERATION), @@ -768,10 +861,10 @@ macro_rules! 
impl_op_arithmetic { /// This function adds/subtracts two "raw" intervals (`lhs` and `rhs`) of different /// types ([`IntervalYearMonthType`] and [`IntervalDayTimeType`], respectively). /// The argument `sign` chooses between addition and subtraction, the argument -/// `commute` swaps `lhs` and `rhs`. The return value is an interval [`ScalarValue`] -/// with type data type [`IntervalMonthDayNanoType`]. +/// `commute` swaps `lhs` and `rhs`. The return value is an 128-bit integer. +/// It can be involved in a [`IntervalMonthDayNanoType`] in the outer scope. #[inline] -fn op_ym_dt(mut lhs: i32, rhs: i64, sign: i32, commute: bool) -> Result { +pub fn op_ym_dt(mut lhs: i32, rhs: i64, sign: i32, commute: bool) -> i128 { let (mut days, millis) = IntervalDayTimeType::to_parts(rhs); let mut nanos = (millis as i64) * 1_000_000; if commute { @@ -780,16 +873,16 @@ fn op_ym_dt(mut lhs: i32, rhs: i64, sign: i32, commute: bool) -> Result Result { +pub fn op_ym_mdn(lhs: i32, rhs: i128, sign: i32, commute: bool) -> i128 { let (mut months, mut days, mut nanos) = IntervalMonthDayNanoType::to_parts(rhs); if commute { months += lhs * sign; @@ -798,20 +891,19 @@ fn op_ym_mdn(lhs: i32, rhs: i128, sign: i32, commute: bool) -> Result Result { +pub fn op_dt_mdn(lhs: i64, rhs: i128, sign: i32, commute: bool) -> i128 { let (lhs_days, lhs_millis) = IntervalDayTimeType::to_parts(lhs); let (rhs_months, rhs_days, rhs_nanos) = IntervalMonthDayNanoType::to_parts(rhs); - - let result = if commute { + if commute { IntervalMonthDayNanoType::make_value( rhs_months, lhs_days * sign + rhs_days, @@ -823,8 +915,45 @@ fn op_dt_mdn(lhs: i64, rhs: i128, sign: i32, commute: bool) -> Result i32 { + lhs + rhs * sign +} + +/// This function adds/subtracts two "raw" intervals (`lhs` and `rhs`) of +/// the same type [`IntervalDayTimeType`]. The argument `sign` chooses between +/// addition and subtraction. The return value is an 64-bit integer. 
It can be +/// involved in a [`IntervalDayTimeType`] in the outer scope. +#[inline] +pub fn op_dt(lhs: i64, rhs: i64, sign: i32) -> i64 { + let (lhs_days, lhs_millis) = IntervalDayTimeType::to_parts(lhs); + let (rhs_days, rhs_millis) = IntervalDayTimeType::to_parts(rhs); + IntervalDayTimeType::make_value( + lhs_days + rhs_days * sign, + lhs_millis + rhs_millis * sign, + ) +} + +/// This function adds/subtracts two "raw" intervals (`lhs` and `rhs`) of +/// the same type [`IntervalMonthDayNanoType`]. The argument `sign` chooses between +/// addition and subtraction. The return value is an 128-bit integer. It can be +/// involved in a [`IntervalMonthDayNanoType`] in the outer scope. +#[inline] +pub fn op_mdn(lhs: i128, rhs: i128, sign: i32) -> i128 { + let (lhs_months, lhs_days, lhs_nanos) = IntervalMonthDayNanoType::to_parts(lhs); + let (rhs_months, rhs_days, rhs_nanos) = IntervalMonthDayNanoType::to_parts(rhs); + IntervalMonthDayNanoType::make_value( + lhs_months + rhs_months * sign, + lhs_days + rhs_days * sign, + lhs_nanos + rhs_nanos * (sign as i64), + ) } macro_rules! get_sign { @@ -837,7 +966,7 @@ macro_rules! get_sign { } #[derive(Clone, Copy)] -enum IntervalMode { +pub enum IntervalMode { Milli, Nano, } @@ -887,7 +1016,7 @@ fn ts_sub_to_interval( /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp using the units (tick size) implied by argument `mode`. 
#[inline] -fn with_timezone_to_naive_datetime( +pub fn with_timezone_to_naive_datetime( ts: i64, tz: &Option, mode: IntervalMode, @@ -970,6 +1099,35 @@ pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result i64 { + let diff_ms = (ts_lhs - ts_rhs) * 1000; + let days = (diff_ms / MILLISECS_IN_ONE_DAY) as i32; + let millis = (diff_ms % MILLISECS_IN_ONE_DAY) as i32; + IntervalDayTimeType::make_value(days, millis) +} +#[inline] +pub fn milliseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i64 { + let diff_ms = ts_lhs - ts_rhs; + let days = (diff_ms / MILLISECS_IN_ONE_DAY) as i32; + let millis = (diff_ms % MILLISECS_IN_ONE_DAY) as i32; + IntervalDayTimeType::make_value(days, millis) +} +#[inline] +pub fn microseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i128 { + let diff_ns = (ts_lhs - ts_rhs) * 1000; + let days = (diff_ns / NANOSECS_IN_ONE_DAY) as i32; + let nanos = diff_ns % NANOSECS_IN_ONE_DAY; + IntervalMonthDayNanoType::make_value(0, days, nanos) +} +#[inline] +pub fn nanoseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i128 { + let diff_ns = ts_lhs - ts_rhs; + let days = (diff_ns / NANOSECS_IN_ONE_DAY) as i32; + let nanos = diff_ns % NANOSECS_IN_ONE_DAY; + IntervalMonthDayNanoType::make_value(0, days, nanos) +} + #[inline] fn do_date_time_math( secs: i64, diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index c0b387a1aa25..f70a06adb38b 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -1363,22 +1363,25 @@ fn make_timestamp_sub_table() -> Result> where A: ArrowTimestampType, { - make_timestamp_tz_sub_table::(None) + make_timestamp_tz_sub_table::(None, None) } -fn make_timestamp_tz_sub_table(tz: Option) -> Result> +fn make_timestamp_tz_sub_table( + tz1: Option, + tz2: Option, +) -> Result> where A: ArrowTimestampType, { let schema = Arc::new(Schema::new(vec![ Field::new( "ts1", - DataType::Timestamp(A::get_time_unit(), tz.clone()), + DataType::Timestamp(A::get_time_unit(), tz1.clone()), false, 
), Field::new( "ts2", - DataType::Timestamp(A::get_time_unit(), tz.clone()), + DataType::Timestamp(A::get_time_unit(), tz2.clone()), false, ), Field::new("val", DataType::Int32, true), @@ -1392,25 +1395,211 @@ where }; let timestamps1 = vec![ - 1678892420_000_000_000i64 / divisor, - 1678892410_000_000_000i64 / divisor, - 1678892430_000_000_000i64 / divisor, + 1_678_892_420_000_000_000i64 / divisor, //2023-03-15T15:00:20.000_000_000 + 1_678_892_410_000_000_000i64 / divisor, //2023-03-15T15:00:10.000_000_000 + 1_678_892_430_000_000_000i64 / divisor, //2023-03-15T15:00:30.000_000_000 ]; let timestamps2 = vec![ - 1678892400_000_000_000i64 / divisor, - 1678892400_000_000_000i64 / divisor, - 1678892400_000_000_000i64 / divisor, + 1_678_892_400_000_000_000i64 / divisor, //2023-03-15T15:00:00.000_000_000 + 1_678_892_400_000_000_000i64 / divisor, //2023-03-15T15:00:00.000_000_000 + 1_678_892_400_000_000_000i64 / divisor, //2023-03-15T15:00:00.000_000_000 ]; let array1 = - PrimitiveArray::::from_iter_values(timestamps1).with_timezone_opt(tz.clone()); - let array2 = PrimitiveArray::::from_iter_values(timestamps2).with_timezone_opt(tz); + PrimitiveArray::::from_iter_values(timestamps1).with_timezone_opt(tz1); + let array2 = + PrimitiveArray::::from_iter_values(timestamps2).with_timezone_opt(tz2); + + let data = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(array1), + Arc::new(array2), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + ], + )?; + let table = MemTable::try_new(schema, vec![vec![data]])?; + Ok(Arc::new(table)) +} + +fn make_ts_interval_table() -> Result> { + let schema = Arc::new(Schema::new(vec![ + Field::new( + "ts_sec1", + DataType::Timestamp(TimeUnit::Second, None), + false, + ), + Field::new( + "ts_sec2", + DataType::Timestamp(TimeUnit::Second, None), + false, + ), + Field::new( + "ts_millisec1", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new( + "ts_millisec2", + 
DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new( + "ts_microsec1", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + Field::new( + "ts_microsec2", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + Field::new( + "ts_nanosec1", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new( + "ts_nanosec2", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new( + "interval_ym1", + DataType::Interval(IntervalUnit::YearMonth), + false, + ), + Field::new( + "interval_ym2", + DataType::Interval(IntervalUnit::YearMonth), + false, + ), + Field::new( + "interval_dt1", + DataType::Interval(IntervalUnit::DayTime), + false, + ), + Field::new( + "interval_dt2", + DataType::Interval(IntervalUnit::DayTime), + false, + ), + Field::new( + "interval_mdn1", + DataType::Interval(IntervalUnit::MonthDayNano), + false, + ), + Field::new( + "interval_mdn2", + DataType::Interval(IntervalUnit::MonthDayNano), + false, + ), + Field::new("val", DataType::Int32, true), + ])); + + let ts_sec1 = vec![ + 1_678_893_420i64, //2023-03-15T15:17:00 + 1_688_894_420i64, //2023-07-09T09:20:20 + 1_698_895_420i64, //2023-11-02T03:23:40 + ]; + let ts_sec2 = vec![ + 1_678_892_420i64, //2023-03-15T15:00:20 + 1_688_892_420i64, //2023-07-09T08:47:00 + 1_698_892_420i64, //2023-11-02T02:33:40 + ]; + let ts_millisec1 = vec![ + 1_678_892_420_002i64, //2023-03-15T15:00:20.002 + 1_688_892_420_001i64, //2023-07-09T08:47:00.001 + 1_698_892_420_003i64, //2023-11-02T02:33:40.003 + ]; + let ts_millisec2 = vec![ + 1_678_892_420_000i64, //2023-03-15T15:00:20.000 + 1_688_892_420_000i64, //2023-07-09T08:47:00.000 + 1_698_892_420_000i64, //2023-11-02T02:33:40.000 + ]; + let ts_microsec1 = vec![ + 1_678_892_420_002_010i64, //2023-03-15T15:00:20.002_010 + 1_688_892_420_001_020i64, //2023-07-09T08:47:00.001_020 + 1_698_892_420_003_030i64, //2023-11-02T02:33:40.003_030 + ]; + let ts_microsec2 = vec![ + 1_678_892_420_000_000i64, 
//2023-03-15T15:00:20.000_000 + 1_688_892_420_000_000i64, //2023-07-09T08:47:00.000_000 + 1_698_892_420_000_000i64, //2023-11-02T02:33:40.000_000 + ]; + let ts_nanosec1 = vec![ + 1_678_892_420_002_200_002i64, //2023-03-15T15:00:20.002_200_002 + 1_688_892_420_001_500_004i64, //2023-07-09T08:47:00.001_500_004 + 1_698_892_420_003_300_003i64, //2023-11-02T02:33:40.003_300_003 + ]; + let ts_nanosec2 = vec![ + 1_678_892_420_000_000_000i64, //2023-03-15T15:00:20.000_000_000 + 1_688_892_420_000_000_000i64, //2023-07-09T08:47:00.000_000_000 + 1_698_892_420_000_000_000i64, //2023-11-02T02:33:40.000_000_000 + ]; + + let intervals_ym1 = vec![11, 8, 23]; + // 11 months, 8 months, 23 months + let intervals_ym2 = vec![3, 7, 5]; + // 3 months, 7 months, 5 months + let intervals_dt1 = vec![4_394_969_299, 4_494_969_298, 4_594_969_297]; + // 1 day 27 hours 46 minutes 42 seconds 3 milliseconds + // 1 day 55 hours 33 minutes 22 seconds 2 milliseconds + // 1 day 83 hours 20 minutes 2 seconds 1 millisecond + let intervals_dt2 = vec![4_294_969_296, 4_294_969_296, 4_294_969_296]; + // 1 day 2 seconds , 1 day 2 seconds , 1 day 2 seconds + let intervals_mdn1 = vec![ + 237_684_487_801_047_429_812_565_639_167, + 396_140_812_755_789_128_704_815_267_841, + 18_446_744_073_709_551_907, + ]; + // 3 months 14 days 65535 nanoseconds + // 5 months 10 days 1 nanosecond + // 0 month 1 day 291 nanoseconds + let intervals_mdn2 = vec![100, 1_000_000, 1_000_000_000]; + // 100 nanoseconds + // 1_000_000 nanoseconds + // 1_000_000_000 nanoseconds + + let array1 = PrimitiveArray::::from_iter_values(ts_sec1); + let array2 = PrimitiveArray::::from_iter_values(ts_sec2); + let array3 = + PrimitiveArray::::from_iter_values(ts_millisec1); + let array4 = + PrimitiveArray::::from_iter_values(ts_millisec2); + let array5 = + PrimitiveArray::::from_iter_values(ts_microsec1); + let array6 = + PrimitiveArray::::from_iter_values(ts_microsec2); + let array7 = PrimitiveArray::::from_iter_values(ts_nanosec1); + let array8 = 
PrimitiveArray::::from_iter_values(ts_nanosec2); + let array9 = PrimitiveArray::::from_iter_values(intervals_ym1); + let array10 = + PrimitiveArray::::from_iter_values(intervals_ym2); + let array11 = PrimitiveArray::::from_iter_values(intervals_dt1); + let array12 = PrimitiveArray::::from_iter_values(intervals_dt2); + let array13 = + PrimitiveArray::::from_iter_values(intervals_mdn1); + let array14 = + PrimitiveArray::::from_iter_values(intervals_mdn2); let data = RecordBatch::try_new( schema.clone(), vec![ Arc::new(array1), Arc::new(array2), + Arc::new(array3), + Arc::new(array4), + Arc::new(array5), + Arc::new(array6), + Arc::new(array7), + Arc::new(array8), + Arc::new(array9), + Arc::new(array10), + Arc::new(array11), + Arc::new(array12), + Arc::new(array13), + Arc::new(array14), Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), ], )?; diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index 7d6bf99b2696..364c7da5945e 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -1693,7 +1693,7 @@ async fn test_ts_dt_binary_ops() -> Result<()> { } #[tokio::test] -async fn timestamp_sub() -> Result<()> { +async fn timestamp_sub_simple() -> Result<()> { let ctx = SessionContext::new(); let table_a = make_timestamp_sub_table::()?; ctx.register_table("table_a", table_a)?; @@ -1711,5 +1711,213 @@ async fn timestamp_sub() -> Result<()> { ]; assert_batches_eq!(expected, &actual); - return Ok(()); + Ok(()) +} + +#[tokio::test] +async fn timestamp_sub_with_tz() -> Result<()> { + let ctx = SessionContext::new(); + let table_a = make_timestamp_tz_sub_table::( + Some("America/Los_Angeles".to_string()), + Some("Europe/Istanbul".to_string()), + )?; + ctx.register_table("table_a", table_a)?; + + let sql = "SELECT val, ts1 - ts2 AS ts_diff FROM table_a ORDER BY ts2 - ts1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + 
"+-----+---------------------------------------------------+", + "| val | ts_diff |", + "+-----+---------------------------------------------------+", + "| 3 | 0 years 0 mons 0 days 10 hours 0 mins 30.000 secs |", + "| 1 | 0 years 0 mons 0 days 10 hours 0 mins 20.000 secs |", + "| 2 | 0 years 0 mons 0 days 10 hours 0 mins 10.000 secs |", + "+-----+---------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + Ok(()) +} + +#[tokio::test] +async fn interval_sub() -> Result<()> { + let ctx = SessionContext::new(); + let table_a = make_ts_interval_table()?; + ctx.register_table("table_a", table_a)?; + + let sql = "SELECT val, interval_dt1 - interval_dt2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+----------------------------------------------------+", + "| val | interval_diff |", + "+-----+----------------------------------------------------+", + "| 3 | 0 years 0 mons 0 days 83 hours 20 mins 0.001 secs |", + "| 2 | 0 years 0 mons 0 days 55 hours 33 mins 20.002 secs |", + "| 1 | 0 years 0 mons 0 days 27 hours 46 mins 40.003 secs |", + "+-----+----------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, interval_ym1 - interval_ym2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+------------------------------------------------+", + "| val | interval_diff |", + "+-----+------------------------------------------------+", + "| 3 | 1 years 6 mons 0 days 0 hours 0 mins 0.00 secs |", + "| 2 | 0 years 1 mons 0 days 0 hours 0 mins 0.00 secs |", + "| 1 | 0 years 8 mons 0 days 0 hours 0 mins 0.00 secs |", + "+-----+------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, interval_mdn1 - 
interval_mdn2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+---------------------------------------------------------+", + "| val | interval_diff |", + "+-----+---------------------------------------------------------+", + "| 3 | 0 years 0 mons 1 days 0 hours 0 mins -0.999999709 secs |", + "| 2 | 0 years 5 mons 10 days 0 hours 0 mins -0.000999999 secs |", + "| 1 | 0 years 3 mons 14 days 0 hours 0 mins 0.000065435 secs |", + "+-----+---------------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, interval_ym1 - interval_dt2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+----------------------------------------------------------+", + "| val | interval_diff |", + "+-----+----------------------------------------------------------+", + "| 3 | 0 years 23 mons -1 days 0 hours 0 mins -2.000000000 secs |", + "| 2 | 0 years 8 mons -1 days 0 hours 0 mins -2.000000000 secs |", + "| 1 | 0 years 11 mons -1 days 0 hours 0 mins -2.000000000 secs |", + "+-----+----------------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + Ok(()) +} + +#[tokio::test] +async fn ts_interval_sub() -> Result<()> { + let ctx = SessionContext::new(); + let table_a = make_ts_interval_table()?; + ctx.register_table("table_a", table_a)?; + + let sql = "SELECT val, ts_millisec1 - interval_dt1 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+-------------------------+", + "| val | ts_interval_diff |", + "+-----+-------------------------+", + "| 3 | 2023-10-28T15:13:38.002 |", + "| 2 | 2023-07-06T01:13:37.999 |", + "| 1 | 2023-03-13T11:13:37.999 |", + 
"+-----+-------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, ts_sec1 - interval_ym1 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+---------------------+", + "| val | ts_interval_diff |", + "+-----+---------------------+", + "| 3 | 2021-12-12T03:23:40 |", + "| 2 | 2022-11-11T09:20:20 |", + "| 1 | 2022-04-19T15:17:00 |", + "+-----+---------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, ts_sec1 - interval_mdn2 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+---------------------+", + "| val | ts_interval_diff |", + "+-----+---------------------+", + "| 3 | 2023-11-02T03:23:39 |", + "| 2 | 2023-07-09T09:20:20 |", + "| 1 | 2023-03-15T15:17:00 |", + "+-----+---------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, ts_nanosec1 - interval_dt2 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+-------------------------------+", + "| val | ts_interval_diff |", + "+-----+-------------------------------+", + "| 3 | 2023-11-01T02:33:38.003300003 |", + "| 2 | 2023-07-08T08:46:58.001500004 |", + "| 1 | 2023-03-14T15:00:18.002200002 |", + "+-----+-------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + Ok(()) +} + +#[tokio::test] +async fn interval_ts_add() -> Result<()> { + let ctx = SessionContext::new(); + let table_a = make_ts_interval_table()?; + ctx.register_table("table_a", table_a)?; + + let sql = "SELECT val, interval_dt1 + ts_millisec2 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = 
vec![ + "+-----+-------------------------+", + "| val | interval_sum_ts |", + "+-----+-------------------------+", + "| 3 | 2023-11-06T13:53:42.001 |", + "| 2 | 2023-07-12T16:20:22.002 |", + "| 1 | 2023-03-17T18:47:02.003 |", + "+-----+-------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, interval_ym2 + ts_sec1 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+---------------------+", + "| val | interval_sum_ts |", + "+-----+---------------------+", + "| 3 | 2024-03-31T03:23:40 |", + "| 2 | 2024-02-04T09:20:20 |", + "| 1 | 2023-06-13T15:17:00 |", + "+-----+---------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, interval_mdn2 + ts_sec2 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+---------------------+", + "| val | interval_sum_ts |", + "+-----+---------------------+", + "| 3 | 2023-11-02T02:33:41 |", + "| 2 | 2023-07-09T08:47:00 |", + "| 1 | 2023-03-15T15:00:20 |", + "+-----+---------------------+", + ]; + assert_batches_eq!(expected, &actual); + + let sql = "SELECT val, interval_mdn1 + ts_nanosec1 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+-------------------------------+", + "| val | interval_sum_ts |", + "+-----+-------------------------------+", + "| 3 | 2023-11-03T02:33:40.003300294 |", + "| 2 | 2023-12-16T08:47:00.001500005 |", + "| 1 | 2023-06-27T15:00:20.002265537 |", + "+-----+-------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + + Ok(()) } diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index d58c6069a527..706c79994a88 100644 --- 
a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -113,11 +113,17 @@ pub fn coerce_types( | Operator::Gt | Operator::GtEq | Operator::LtEq => comparison_coercion(lhs_type, rhs_type), + // interval - timestamp is an erroneous case, cannot coerce a type Operator::Plus | Operator::Minus - if is_date(lhs_type) + if (is_date(lhs_type) || is_date(rhs_type) || is_timestamp(lhs_type) - || is_timestamp(rhs_type) => + || is_timestamp(rhs_type) + || is_interval(lhs_type) + || is_interval(rhs_type)) + && (!is_interval(lhs_type) + || !is_timestamp(rhs_type) + || *op != Operator::Minus) => { temporal_add_sub_coercion(lhs_type, rhs_type, op)? } @@ -215,12 +221,12 @@ pub fn temporal_add_sub_coercion( return Ok(Some(rhs_type.clone())); } - // date or timestamp + interval + // date or timestamp + - interval if is_interval(rhs_type) && (is_date(lhs_type) || is_timestamp(lhs_type)) { return Ok(Some(lhs_type.clone())); } - // timestamp + timestamp with - operator + // timestamp - timestamp if is_timestamp(lhs_type) && is_timestamp(rhs_type) && (*op == Operator::Minus) { // At this stage, a timestamp can be subtracted from a timestamp only if they // have the same type. 
To not lose data, second and millisecond precision @@ -233,23 +239,23 @@ pub fn temporal_add_sub_coercion( ( DataType::Timestamp(TimeUnit::Second, _), DataType::Timestamp(TimeUnit::Second, _), - ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), - ( + ) + | ( DataType::Timestamp(TimeUnit::Millisecond, _), DataType::Timestamp(TimeUnit::Millisecond, _), ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), ( DataType::Timestamp(TimeUnit::Microsecond, _), DataType::Timestamp(TimeUnit::Microsecond, _), - ) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), - ( + ) + | ( DataType::Timestamp(TimeUnit::Nanosecond, _), DataType::Timestamp(TimeUnit::Nanosecond, _), ) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), (_, _) => { - return Err(DataFusionError::Plan(format!( - "The timestamps have different types" - ))); + return Err(DataFusionError::Plan( + "The timestamps have different types".to_string(), + )); } } } @@ -992,11 +998,11 @@ mod tests { let err = coerce_types( &DataType::Timestamp(TimeUnit::Nanosecond, None), &Operator::Minus, - &DataType::Timestamp(TimeUnit::Nanosecond, None), + &DataType::Timestamp(TimeUnit::Millisecond, None), ) .unwrap_err() .to_string(); - assert_contains!(&err, "'Timestamp(Nanosecond, None) - Timestamp(Nanosecond, None)' is an unsupported operation. 
addition/subtraction on dates/timestamps only supported with interval types"); + assert_contains!(&err, "The timestamps have different types"); let err = coerce_types(&DataType::Date32, &Operator::Plus, &DataType::Date64) .unwrap_err() diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 41fd230ebcb8..3de61c16bebc 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -20,24 +20,18 @@ use crate::PhysicalExpr; use arrow::array::{Array, ArrayRef}; use arrow::compute::{binary, unary}; use arrow::datatypes::{ - DataType, Date32Type, Date64Type, IntervalDayTimeType, Schema, TimeUnit, + ArrowNativeTypeOp, DataType, Date32Type, Date64Type, IntervalDayTimeType, + IntervalMonthDayNanoType, IntervalYearMonthType, Schema, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; use arrow::record_batch::RecordBatch; use arrow_schema::IntervalUnit; -use datafusion_common::cast::{ - as_date32_array, as_date64_array, as_timestamp_microsecond_array, - as_timestamp_millisecond_array, as_timestamp_nanosecond_array, - as_timestamp_second_array, -}; -use datafusion_common::scalar::{ - date32_add, date64_add, microseconds_add, milliseconds_add, nanoseconds_add, - seconds_add, trial, ts_sub_to_interval, IntervalMode, -}; +use datafusion_common::cast::*; +use datafusion_common::scalar::*; use datafusion_common::Result; use datafusion_common::{DataFusionError, ScalarValue}; -use datafusion_expr::type_coercion::is_interval; +use datafusion_expr::type_coercion::binary::coerce_types; use datafusion_expr::{ColumnarValue, Operator}; use std::any::Any; use std::fmt::{Display, Formatter}; @@ -117,35 +111,11 @@ impl PhysicalExpr for DateTimeIntervalExpr { } fn data_type(&self, input_schema: &Schema) -> Result { - let lhs_data_type = self.lhs.data_type(input_schema)?; - let rhs_data_type = 
self.rhs.data_type(input_schema)?; - - if is_interval(&lhs_data_type) && is_interval(&rhs_data_type) { - if lhs_data_type == rhs_data_type { - return Ok(lhs_data_type); - } else { - return Ok(DataType::Interval(IntervalUnit::MonthDayNano)); - } - } - match (lhs_data_type, rhs_data_type) { - ( - DataType::Timestamp(TimeUnit::Second, _), - DataType::Timestamp(TimeUnit::Second, _), - ) - | ( - DataType::Timestamp(TimeUnit::Millisecond, _), - DataType::Timestamp(TimeUnit::Millisecond, _), - ) => Ok(DataType::Interval(IntervalUnit::DayTime)), - ( - DataType::Timestamp(TimeUnit::Microsecond, _), - DataType::Timestamp(TimeUnit::Microsecond, _), - ) - | ( - DataType::Timestamp(TimeUnit::Nanosecond, _), - DataType::Timestamp(TimeUnit::Nanosecond, _), - ) => Ok(DataType::Interval(IntervalUnit::MonthDayNano)), - (_, _) => self.lhs.data_type(input_schema), - } + coerce_types( + &self.lhs.data_type(input_schema)?, + &Operator::Minus, + &self.rhs.data_type(input_schema)?, + ) } fn nullable(&self, input_schema: &Schema) -> Result { @@ -153,8 +123,8 @@ impl PhysicalExpr for DateTimeIntervalExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - let lhs_columnar = self.lhs.evaluate(batch)?; - let rhs_columnar = self.rhs.evaluate(batch)?; + let lhs_value = self.lhs.evaluate(batch)?; + let rhs_value = self.rhs.evaluate(batch)?; // Invert sign for subtraction let sign = match self.op { Operator::Plus => 1, @@ -170,29 +140,25 @@ impl PhysicalExpr for DateTimeIntervalExpr { // or LHS is an Array and unary operations for related types are // applied in evaluate_array function. If RHS is an Array, then // LHS must also be, moreover; they must be the same Timestamp type. - match &rhs_columnar { - ColumnarValue::Scalar(operand_rhs) => match lhs_columnar { - ColumnarValue::Scalar(operand_lhs) => { - Ok(ColumnarValue::Scalar(if sign > 0 { - operand_lhs.add(operand_rhs)? - } else { - operand_lhs.sub(operand_rhs)? 
- })) - } - ColumnarValue::Array(array_lhs) => { - evaluate_array(array_lhs, sign, operand_rhs) - } - }, - ColumnarValue::Array(array_rhs) => match lhs_columnar { - ColumnarValue::Array(array_lhs) => { - evaluate_arrays(array_lhs, sign, array_rhs) - } - _ => { - let msg = - "If RHS of the operation is an array, then LHS also must be"; - Err(DataFusionError::Internal(msg.to_string())) - } - }, + match (lhs_value, rhs_value) { + (ColumnarValue::Scalar(operand_lhs), ColumnarValue::Scalar(operand_rhs)) => { + Ok(ColumnarValue::Scalar(if sign > 0 { + operand_lhs.add(&operand_rhs)? + } else { + operand_lhs.sub(&operand_rhs)? + })) + } + (ColumnarValue::Array(array_lhs), ColumnarValue::Scalar(operand_rhs)) => { + evaluate_array(array_lhs, sign, &operand_rhs) + } + + (ColumnarValue::Array(array_lhs), ColumnarValue::Array(array_rhs)) => { + evaluate_arrays(&array_lhs, sign, &array_rhs) + } + (_, _) => { + let msg = "If RHS of the operation is an array, then LHS also must be"; + Err(DataFusionError::Internal(msg.to_string())) + } } } @@ -282,30 +248,99 @@ pub fn evaluate_array( Ok(ColumnarValue::Array(ret)) } +macro_rules! ts_sub_op { + ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $mode:expr, $type_in:ty, $type_out:ty) => {{ + let prim_array_lhs = $caster(&$lhs)?; + let prim_array_rhs = $caster(&$rhs)?; + let ret = Arc::new(binary::<$type_in, $type_in, _, $type_out>( + prim_array_lhs, + prim_array_rhs, + |ts1, ts2| { + $op( + with_timezone_to_naive_datetime( + ts1.mul_wrapping($coef), + &$lhs_tz, + $mode, + ) + .expect("{ts1} timestamp cannot build a DateTime object") + .timestamp(), + with_timezone_to_naive_datetime( + ts2.mul_wrapping($coef), + &$rhs_tz, + $mode, + ) + .expect("{ts2} timestamp cannot build a DateTime object") + .timestamp(), + ) + }, + )?) as ArrayRef; + ret + }}; +} +macro_rules! 
interval_op { + ($lhs:ident, $rhs:ident, $caster:expr, $op:expr, $sign:ident, $type_in:ty) => {{ + let prim_array_lhs = $caster(&$lhs)?; + let prim_array_rhs = $caster(&$rhs)?; + let ret = Arc::new(binary::<$type_in, $type_in, _, $type_in>( + prim_array_lhs, + prim_array_rhs, + |interval1, interval2| $op(interval1, interval2, $sign), + )?) as ArrayRef; + ret + }}; +} +macro_rules! interval_cross_op { + ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $commute:ident, $type_in1:ty, $type_in2:ty) => {{ + let prim_array_lhs = $caster1(&$lhs)?; + let prim_array_rhs = $caster2(&$rhs)?; + let ret = Arc::new(binary::<$type_in1, $type_in2, _, IntervalMonthDayNanoType>( + prim_array_lhs, + prim_array_rhs, + |interval1, interval2| $op(interval1, interval2, $sign, $commute), + )?) as ArrayRef; + ret + }}; +} +macro_rules! ts_interval_op { + ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ + let prim_array_lhs = $caster1(&$lhs)?; + let prim_array_rhs = $caster2(&$rhs)?; + let ret = Arc::new(binary::<$type_in1, $type_in2, _, $type_in1>( + prim_array_lhs, + prim_array_rhs, + |ts, interval| { + ts.add_wrapping( + $sign as i64 + * $op(&Some(interval)) + .expect("Interval cannot computed as nanosecond") + as i64, + ) + }, + )?) 
as ArrayRef; + ret + }}; +} pub fn evaluate_arrays( - array_lhs: ArrayRef, + array_lhs: &ArrayRef, sign: i32, array_rhs: &ArrayRef, ) -> Result { - let err = - || DataFusionError::Execution("Overflow while evaluating arrays".to_string()); let ret = match (array_lhs.data_type(), array_rhs.data_type()) { - ( - DataType::Timestamp(TimeUnit::Second, opt_tz_lhs), - DataType::Timestamp(TimeUnit::Second, opt_tz_rhs), - ) => { - let prim_array_lhs = as_timestamp_second_array(&array_lhs)?; - let prim_array_rhs = as_timestamp_second_array(&array_rhs)?; - Arc::new( - binary::( - prim_array_lhs, - prim_array_rhs, - |ts1: TimestampSecondType, ts2: TimestampSecondType| { - trial(ts1, ts2) - }, - ) - .unwrap(), - ) + // Timestamp - Timestamp operations, operands of only the same types are supported. + (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) => { + ts_array_op(array_lhs, array_rhs)? + } + // Interval (+ , -) Interval operations + (DataType::Interval(_), DataType::Interval(_)) => { + interval_array_op(array_lhs, array_rhs, sign)? + } + // Timestamp (+ , -) Interval and Interval + Timestamp operations + // Interval - Timestamp operation is not rational hence not supported + (DataType::Timestamp(_, _), DataType::Interval(_)) => { + ts_interval_array_op(array_lhs, sign, array_rhs)? + } + (DataType::Interval(_), DataType::Timestamp(_, _)) if sign == 1 => { + ts_interval_array_op(array_rhs, sign, array_lhs)? 
} (_, _) => Err(DataFusionError::Execution(format!( "Invalid array types for DateIntervalExpr: {:?} {} {:?}", @@ -313,10 +348,383 @@ pub fn evaluate_arrays( sign, array_rhs.data_type() )))?, - } as ArrayRef; + }; Ok(ColumnarValue::Array(ret)) } +fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { + match (array_lhs.data_type(), array_rhs.data_type()) { + ( + DataType::Timestamp(TimeUnit::Second, opt_tz_lhs), + DataType::Timestamp(TimeUnit::Second, opt_tz_rhs), + ) => Ok(ts_sub_op!( + array_lhs, + array_rhs, + opt_tz_lhs, + opt_tz_rhs, + 1000i64, + as_timestamp_second_array, + seconds_sub, + IntervalMode::Milli, + TimestampSecondType, + IntervalDayTimeType + )), + ( + DataType::Timestamp(TimeUnit::Millisecond, opt_tz_lhs), + DataType::Timestamp(TimeUnit::Millisecond, opt_tz_rhs), + ) => Ok(ts_sub_op!( + array_lhs, + array_rhs, + opt_tz_lhs, + opt_tz_rhs, + 1i64, + as_timestamp_millisecond_array, + milliseconds_sub, + IntervalMode::Milli, + TimestampMillisecondType, + IntervalDayTimeType + )), + ( + DataType::Timestamp(TimeUnit::Microsecond, opt_tz_lhs), + DataType::Timestamp(TimeUnit::Microsecond, opt_tz_rhs), + ) => Ok(ts_sub_op!( + array_lhs, + array_rhs, + opt_tz_lhs, + opt_tz_rhs, + 1000i64, + as_timestamp_microsecond_array, + microseconds_sub, + IntervalMode::Nano, + TimestampMicrosecondType, + IntervalMonthDayNanoType + )), + ( + DataType::Timestamp(TimeUnit::Nanosecond, opt_tz_lhs), + DataType::Timestamp(TimeUnit::Nanosecond, opt_tz_rhs), + ) => Ok(ts_sub_op!( + array_lhs, + array_rhs, + opt_tz_lhs, + opt_tz_rhs, + 1i64, + as_timestamp_nanosecond_array, + nanoseconds_sub, + IntervalMode::Nano, + TimestampNanosecondType, + IntervalMonthDayNanoType + )), + (_, _) => Err(DataFusionError::Execution(format!( + "Invalid array types for Timestamp subtraction: {:?} - {:?}", + array_lhs.data_type(), + array_rhs.data_type() + ))), + } +} + +fn interval_array_op( + array_lhs: &ArrayRef, + array_rhs: &ArrayRef, + sign: i32, +) -> Result { + match 
(array_lhs.data_type(), array_rhs.data_type()) { + ( + DataType::Interval(IntervalUnit::YearMonth), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(interval_op!( + array_lhs, + array_rhs, + as_interval_ym_array, + op_ym, + sign, + IntervalYearMonthType + )), + ( + DataType::Interval(IntervalUnit::YearMonth), + DataType::Interval(IntervalUnit::DayTime), + ) => Ok(interval_cross_op!( + array_lhs, + array_rhs, + as_interval_ym_array, + as_interval_dt_array, + op_ym_dt, + sign, + false, + IntervalYearMonthType, + IntervalDayTimeType + )), + ( + DataType::Interval(IntervalUnit::YearMonth), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(interval_cross_op!( + array_lhs, + array_rhs, + as_interval_ym_array, + as_interval_mdn_array, + op_ym_mdn, + sign, + false, + IntervalYearMonthType, + IntervalMonthDayNanoType + )), + ( + DataType::Interval(IntervalUnit::DayTime), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(interval_cross_op!( + array_rhs, + array_lhs, + as_interval_ym_array, + as_interval_dt_array, + op_ym_dt, + sign, + true, + IntervalYearMonthType, + IntervalDayTimeType + )), + ( + DataType::Interval(IntervalUnit::DayTime), + DataType::Interval(IntervalUnit::DayTime), + ) => Ok(interval_op!( + array_lhs, + array_rhs, + as_interval_dt_array, + op_dt, + sign, + IntervalDayTimeType + )), + ( + DataType::Interval(IntervalUnit::DayTime), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(interval_cross_op!( + array_lhs, + array_rhs, + as_interval_dt_array, + as_interval_mdn_array, + op_dt_mdn, + sign, + false, + IntervalDayTimeType, + IntervalMonthDayNanoType + )), + ( + DataType::Interval(IntervalUnit::MonthDayNano), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(interval_cross_op!( + array_rhs, + array_lhs, + as_interval_ym_array, + as_interval_mdn_array, + op_ym_mdn, + sign, + true, + IntervalYearMonthType, + IntervalMonthDayNanoType + )), + ( + DataType::Interval(IntervalUnit::MonthDayNano), + 
DataType::Interval(IntervalUnit::DayTime), + ) => Ok(interval_cross_op!( + array_rhs, + array_lhs, + as_interval_dt_array, + as_interval_mdn_array, + op_dt_mdn, + sign, + true, + IntervalDayTimeType, + IntervalMonthDayNanoType + )), + ( + DataType::Interval(IntervalUnit::MonthDayNano), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(interval_op!( + array_lhs, + array_rhs, + as_interval_mdn_array, + op_mdn, + sign, + IntervalMonthDayNanoType + )), + (_, _) => Err(DataFusionError::Execution(format!( + "Invalid array types for Interval operation: {:?} {} {:?}", + array_lhs.data_type(), + sign, + array_rhs.data_type() + ))), + } +} + +fn ts_interval_array_op( + array_lhs: &ArrayRef, + sign: i32, + array_rhs: &ArrayRef, +) -> Result { + match (array_lhs.data_type(), array_rhs.data_type()) { + ( + DataType::Timestamp(TimeUnit::Second, _), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_second_array, + as_interval_ym_array, + ym_to_sec, + sign, + TimestampSecondType, + IntervalYearMonthType + )), + ( + DataType::Timestamp(TimeUnit::Second, _), + DataType::Interval(IntervalUnit::DayTime), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_second_array, + as_interval_dt_array, + dt_to_sec, + sign, + TimestampSecondType, + IntervalDayTimeType + )), + ( + DataType::Timestamp(TimeUnit::Second, _), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_second_array, + as_interval_mdn_array, + mdn_to_sec, + sign, + TimestampSecondType, + IntervalMonthDayNanoType + )), + ( + DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_millisecond_array, + as_interval_ym_array, + ym_to_milli, + sign, + TimestampMillisecondType, + IntervalYearMonthType + )), + ( + DataType::Timestamp(TimeUnit::Millisecond, _), + 
DataType::Interval(IntervalUnit::DayTime), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_millisecond_array, + as_interval_dt_array, + dt_to_milli, + sign, + TimestampMillisecondType, + IntervalDayTimeType + )), + ( + DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_millisecond_array, + as_interval_mdn_array, + mdn_to_milli, + sign, + TimestampMillisecondType, + IntervalMonthDayNanoType + )), + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_microsecond_array, + as_interval_ym_array, + ym_to_micro, + sign, + TimestampMicrosecondType, + IntervalYearMonthType + )), + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Interval(IntervalUnit::DayTime), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_microsecond_array, + as_interval_dt_array, + dt_to_micro, + sign, + TimestampMicrosecondType, + IntervalDayTimeType + )), + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_microsecond_array, + as_interval_mdn_array, + mdn_to_micro, + sign, + TimestampMicrosecondType, + IntervalMonthDayNanoType + )), + ( + DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_nanosecond_array, + as_interval_ym_array, + ym_to_nano, + sign, + TimestampNanosecondType, + IntervalYearMonthType + )), + ( + DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Interval(IntervalUnit::DayTime), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_nanosecond_array, + as_interval_dt_array, + dt_to_nano, + sign, + TimestampNanosecondType, + IntervalDayTimeType + )), + ( + 
DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Interval(IntervalUnit::MonthDayNano), + ) => Ok(ts_interval_op!( + array_lhs, + array_rhs, + as_timestamp_nanosecond_array, + as_interval_mdn_array, + mdn_to_nano, + sign, + TimestampNanosecondType, + IntervalMonthDayNanoType + )), + (_, _) => Err(DataFusionError::Execution(format!( + "Invalid array types for Timestamp Interval operation: {:?} {} {:?}", + array_lhs.data_type(), + sign, + array_rhs.data_type() + ))), + } +} + #[cfg(test)] mod tests { use super::*; From e14a16fa61e11fa4a6995cfa3dc103ddcc8c4c9c Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:07:10 +0300 Subject: [PATCH 23/55] Code refactor --- datafusion/common/src/scalar.rs | 35 ++--- datafusion/expr/src/type_coercion/binary.rs | 138 +++++++++--------- .../physical-expr/src/expressions/datetime.rs | 22 +-- 3 files changed, 99 insertions(+), 96 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 7b845257efb1..21d50b4b9845 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -639,23 +639,21 @@ macro_rules! impl_op { "Overflow while converting seconds to milliseconds".to_string(), ) }; - ts_sub_to_interval( + ts_sub_to_interval::( ts_lhs.checked_mul(1_000).ok_or_else(err)?, ts_rhs.checked_mul(1_000).ok_or_else(err)?, &tz_lhs, &tz_rhs, - IntervalMode::Milli, ) }, ( ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), - ) => ts_sub_to_interval( + ) => ts_sub_to_interval::( *ts_lhs, *ts_rhs, tz_lhs, tz_rhs, - IntervalMode::Milli, ), ( ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), @@ -666,23 +664,21 @@ macro_rules! 
impl_op { "Overflow while converting microseconds to nanoseconds".to_string(), ) }; - ts_sub_to_interval( + ts_sub_to_interval::( ts_lhs.checked_mul(1_000).ok_or_else(err)?, ts_rhs.checked_mul(1_000).ok_or_else(err)?, tz_lhs, tz_rhs, - IntervalMode::Nano, ) }, ( ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), - ) => ts_sub_to_interval( + ) => ts_sub_to_interval::( *ts_lhs, *ts_rhs, tz_lhs, tz_rhs, - IntervalMode::Nano, ), _ => impl_op_arithmetic!($LHS, $RHS, -) } @@ -971,34 +967,36 @@ pub enum IntervalMode { Nano, } +pub const MILLISECOND_MODE: bool = false; +pub const NANOSECOND_MODE: bool = true; + /// This function computes subtracts `rhs_ts` from `lhs_ts`, taking timezones /// into account when given. Units of the resulting interval is specified by -/// the argument `mode`. +/// the constant `INTERVAL_MODE`. /// The default behavior of Datafusion is the following: /// - When subtracting timestamps at seconds/milliseconds precision, the output /// interval will have the type [`IntervalDayTimeType`]. /// - When subtracting timestamps at microseconds/nanoseconds precision, the /// output interval will have the type [`IntervalMonthDayNanoType`]. 
-fn ts_sub_to_interval( +fn ts_sub_to_interval( lhs_ts: i64, rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, - mode: IntervalMode, ) -> Result { - let lhs_dt = with_timezone_to_naive_datetime(lhs_ts, lhs_tz, mode)?; - let rhs_dt = with_timezone_to_naive_datetime(rhs_ts, rhs_tz, mode)?; + let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, lhs_tz)?; + let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, rhs_tz)?; let delta_secs = lhs_dt.signed_duration_since(rhs_dt); - match mode { - IntervalMode::Milli => { + match INTERVAL_MODE { + MILLISECOND_MODE => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::new_interval_dt( (as_millisecs / MILLISECS_IN_ONE_DAY) as i32, (as_millisecs % MILLISECS_IN_ONE_DAY) as i32, )) } - IntervalMode::Nano => { + NANOSECOND_MODE => { let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( "Can not compute timestamp differences with nanosecond precision", @@ -1016,12 +1014,11 @@ fn ts_sub_to_interval( /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp using the units (tick size) implied by argument `mode`. 
#[inline] -pub fn with_timezone_to_naive_datetime( +pub fn with_timezone_to_naive_datetime( ts: i64, tz: &Option, - mode: IntervalMode, ) -> Result { - let datetime = if let IntervalMode::Milli = mode { + let datetime = if INTERVAL_MODE == MILLISECOND_MODE { ticks_to_naive_datetime::<1_000_000>(ts) } else { ticks_to_naive_datetime::<1>(ts) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 706c79994a88..e85bc25f6989 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -210,84 +210,88 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Result> { - // interval + date or timestamp - if is_interval(lhs_type) && (is_date(rhs_type) || is_timestamp(rhs_type)) { - return Ok(Some(rhs_type.clone())); - } - - // date or timestamp + - interval - if is_interval(rhs_type) && (is_date(lhs_type) || is_timestamp(lhs_type)) { - return Ok(Some(lhs_type.clone())); - } - - // timestamp - timestamp - if is_timestamp(lhs_type) && is_timestamp(rhs_type) && (*op == Operator::Minus) { - // At this stage, a timestamp can be subtracted from a timestamp only if they - // have the same type. To not lose data, second and millisecond precision - // timestamps give output in the type of `IntervalDayTime`, and microsecond - // and nanosecond precision timestamps give in the type of `IntervalMonthDayNano`. - // A nanosecond precision subtraction may result in `IntervalYearMonth` or - // `IntervalDayTime` without loss of data, however; we need to be deterministic - // while determining the type of the output. 
- match (lhs_type, rhs_type) { - ( - DataType::Timestamp(TimeUnit::Second, _), - DataType::Timestamp(TimeUnit::Second, _), - ) - | ( - DataType::Timestamp(TimeUnit::Millisecond, _), - DataType::Timestamp(TimeUnit::Millisecond, _), - ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), - ( - DataType::Timestamp(TimeUnit::Microsecond, _), - DataType::Timestamp(TimeUnit::Microsecond, _), - ) - | ( - DataType::Timestamp(TimeUnit::Nanosecond, _), - DataType::Timestamp(TimeUnit::Nanosecond, _), - ) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), - (_, _) => { - return Err(DataFusionError::Plan( - "The timestamps have different types".to_string(), - )); - } + match (lhs_type, rhs_type, op) { + // if an interval is being added/subtracted from a date/timestamp, return the date/timestamp data type + (lhs, rhs, _) if is_interval(lhs) && (is_date(rhs) || is_timestamp(rhs)) => { + Ok(Some(rhs.clone())) } + (lhs, rhs, _) if is_interval(rhs) && (is_date(lhs) || is_timestamp(lhs)) => { + Ok(Some(lhs.clone())) + } + // if two timestamps are being subtracted, check their time units and return the corresponding interval data type + (lhs, rhs, Operator::Minus) if is_timestamp(lhs) && is_timestamp(rhs) => { + handle_timestamp_minus(lhs, rhs) + } + // if two intervals are being added/subtracted, check their interval units and return the corresponding interval data type + (lhs, rhs, _) if is_interval(lhs) && is_interval(rhs) => handle_interval_addition(lhs, rhs), + // if two date/timestamp are being added/subtracted, return an error indicating that the operation is not supported + (lhs, rhs, _) if (is_date(lhs) || is_timestamp(lhs)) && (is_date(rhs) || is_timestamp(rhs)) => { + Err(DataFusionError::Plan(format!( + "'{:?} {} {:?}' is an unsupported operation. 
\ + addition/subtraction on dates/timestamps only supported with interval types + ", + lhs_type, op, rhs_type + ))) + } + // return None if no coercion is possible + _ => Ok(None), } +} - // interval + interval - if is_interval(lhs_type) && is_interval(rhs_type) { - match (lhs_type, rhs_type) { - // operation with the same types - ( - DataType::Interval(IntervalUnit::YearMonth), - DataType::Interval(IntervalUnit::YearMonth), - ) => return Ok(Some(DataType::Interval(IntervalUnit::YearMonth))), - ( - DataType::Interval(IntervalUnit::DayTime), - DataType::Interval(IntervalUnit::DayTime), - ) => return Ok(Some(DataType::Interval(IntervalUnit::DayTime))), - // operation with MonthDayNano's or different types - (_, _) => return Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), - } +// This function checks if two interval data types have the same interval unit and returns an interval data type +// representing the sum of them. If the two interval data types have different units, it returns an interval data type +// with "IntervalUnit::MonthDayNano". If the two interval data types are already "IntervalUnit::YearMonth" or "IntervalUnit::DayTime", +// it returns an interval data type with the same unit as the operands. 
+fn handle_interval_addition(lhs: &DataType, rhs: &DataType) -> Result> { + match (lhs, rhs) { + // operation with the same types + ( + DataType::Interval(IntervalUnit::YearMonth), + DataType::Interval(IntervalUnit::YearMonth), + ) => Ok(Some(DataType::Interval(IntervalUnit::YearMonth))), + ( + DataType::Interval(IntervalUnit::DayTime), + DataType::Interval(IntervalUnit::DayTime), + ) => Ok(Some(DataType::Interval(IntervalUnit::DayTime))), + // operation with MonthDayNano's or different types + (_, _) => Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), } +} - // date + date or timestamp + timestamp with + operator - if (is_date(lhs_type) || is_timestamp(lhs_type)) - && (is_date(rhs_type) || is_timestamp(rhs_type)) - { - return Err(DataFusionError::Plan( - format!( - "'{lhs_type:?} {op} {rhs_type:?}' is an unsupported operation. \ - addition/subtraction on dates/timestamps only supported with interval types" - ),)); +// This function checks if two timestamp data types have the same time unit and returns an interval data type +// representing the difference between them, either "IntervalUnit::DayTime" if the time unit is second or millisecond, +// or "IntervalUnit::MonthDayNano" if the time unit is microsecond or nanosecond. If the two timestamp data types have +// different time units, it returns an error indicating that "The timestamps have different types". 
+fn handle_timestamp_minus(lhs: &DataType, rhs: &DataType) -> Result> { + match (lhs, rhs) { + ( + DataType::Timestamp(TimeUnit::Second, _), + DataType::Timestamp(TimeUnit::Second, _), + ) + | ( + DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Timestamp(TimeUnit::Millisecond, _), + ) => Ok(Some(DataType::Interval(IntervalUnit::DayTime))), + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Timestamp(TimeUnit::Microsecond, _), + ) + | ( + DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Timestamp(TimeUnit::Nanosecond, _), + ) => Ok(Some(DataType::Interval(IntervalUnit::MonthDayNano))), + (_, _) => Err(DataFusionError::Plan( + "The timestamps have different types".to_string(), + )), } - Ok(None) } /// Returns the output type of applying numeric operations such as `=` diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 3de61c16bebc..95b85de5993b 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -153,7 +153,7 @@ impl PhysicalExpr for DateTimeIntervalExpr { } (ColumnarValue::Array(array_lhs), ColumnarValue::Array(array_rhs)) => { - evaluate_arrays(&array_lhs, sign, &array_rhs) + evaluate_temporal_arrays(&array_lhs, sign, &array_rhs) } (_, _) => { let msg = "If RHS of the operation is an array, then LHS also must be"; @@ -257,17 +257,15 @@ macro_rules! ts_sub_op { prim_array_rhs, |ts1, ts2| { $op( - with_timezone_to_naive_datetime( + with_timezone_to_naive_datetime::<$mode>( ts1.mul_wrapping($coef), &$lhs_tz, - $mode, ) .expect("{ts1} timestamp cannot build a DateTime object") .timestamp(), - with_timezone_to_naive_datetime( + with_timezone_to_naive_datetime::<$mode>( ts2.mul_wrapping($coef), &$rhs_tz, - $mode, ) .expect("{ts2} timestamp cannot build a DateTime object") .timestamp(), @@ -320,7 +318,11 @@ macro_rules! 
ts_interval_op { ret }}; } -pub fn evaluate_arrays( +// This function evaluates temporal array operations, such as timestamp - timestamp, interval + interval, +// timestamp + interval, and interval + timestamp. It takes two arrays as input and an integer sign representing +// the operation (+1 for addition and -1 for subtraction). It returns a ColumnarValue as output, which can hold +// either a scalar or an array. +pub fn evaluate_temporal_arrays( array_lhs: &ArrayRef, sign: i32, array_rhs: &ArrayRef, @@ -365,7 +367,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_second_array, seconds_sub, - IntervalMode::Milli, + MILLISECOND_MODE, TimestampSecondType, IntervalDayTimeType )), @@ -380,7 +382,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_millisecond_array, milliseconds_sub, - IntervalMode::Milli, + MILLISECOND_MODE, TimestampMillisecondType, IntervalDayTimeType )), @@ -395,7 +397,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_microsecond_array, microseconds_sub, - IntervalMode::Nano, + NANOSECOND_MODE, TimestampMicrosecondType, IntervalMonthDayNanoType )), @@ -410,7 +412,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_nanosecond_array, nanoseconds_sub, - IntervalMode::Nano, + NANOSECOND_MODE, TimestampNanosecondType, IntervalMonthDayNanoType )), From 9f82bbbfedd9a424023b1b793604ec5d19d4f2d3 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 24 Mar 2023 16:18:33 +0300 Subject: [PATCH 24/55] retract changes in scalar and datetime --- datafusion/common/src/scalar.rs | 35 ++++++++++--------- .../physical-expr/src/expressions/datetime.rs | 22 ++++++------ 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 21d50b4b9845..7b845257efb1 100644 --- a/datafusion/common/src/scalar.rs +++ 
b/datafusion/common/src/scalar.rs @@ -639,21 +639,23 @@ macro_rules! impl_op { "Overflow while converting seconds to milliseconds".to_string(), ) }; - ts_sub_to_interval::( + ts_sub_to_interval( ts_lhs.checked_mul(1_000).ok_or_else(err)?, ts_rhs.checked_mul(1_000).ok_or_else(err)?, &tz_lhs, &tz_rhs, + IntervalMode::Milli, ) }, ( ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), - ) => ts_sub_to_interval::( + ) => ts_sub_to_interval( *ts_lhs, *ts_rhs, tz_lhs, tz_rhs, + IntervalMode::Milli, ), ( ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), @@ -664,21 +666,23 @@ macro_rules! impl_op { "Overflow while converting microseconds to nanoseconds".to_string(), ) }; - ts_sub_to_interval::( + ts_sub_to_interval( ts_lhs.checked_mul(1_000).ok_or_else(err)?, ts_rhs.checked_mul(1_000).ok_or_else(err)?, tz_lhs, tz_rhs, + IntervalMode::Nano, ) }, ( ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), - ) => ts_sub_to_interval::( + ) => ts_sub_to_interval( *ts_lhs, *ts_rhs, tz_lhs, tz_rhs, + IntervalMode::Nano, ), _ => impl_op_arithmetic!($LHS, $RHS, -) } @@ -967,36 +971,34 @@ pub enum IntervalMode { Nano, } -pub const MILLISECOND_MODE: bool = false; -pub const NANOSECOND_MODE: bool = true; - /// This function computes subtracts `rhs_ts` from `lhs_ts`, taking timezones /// into account when given. Units of the resulting interval is specified by -/// the constant `INTERVAL_MODE`. +/// the argument `mode`. /// The default behavior of Datafusion is the following: /// - When subtracting timestamps at seconds/milliseconds precision, the output /// interval will have the type [`IntervalDayTimeType`]. /// - When subtracting timestamps at microseconds/nanoseconds precision, the /// output interval will have the type [`IntervalMonthDayNanoType`]. 
-fn ts_sub_to_interval( +fn ts_sub_to_interval( lhs_ts: i64, rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, + mode: IntervalMode, ) -> Result { - let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, lhs_tz)?; - let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, rhs_tz)?; + let lhs_dt = with_timezone_to_naive_datetime(lhs_ts, lhs_tz, mode)?; + let rhs_dt = with_timezone_to_naive_datetime(rhs_ts, rhs_tz, mode)?; let delta_secs = lhs_dt.signed_duration_since(rhs_dt); - match INTERVAL_MODE { - MILLISECOND_MODE => { + match mode { + IntervalMode::Milli => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::new_interval_dt( (as_millisecs / MILLISECS_IN_ONE_DAY) as i32, (as_millisecs % MILLISECS_IN_ONE_DAY) as i32, )) } - NANOSECOND_MODE => { + IntervalMode::Nano => { let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( "Can not compute timestamp differences with nanosecond precision", @@ -1014,11 +1016,12 @@ fn ts_sub_to_interval( /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp using the units (tick size) implied by argument `mode`. 
#[inline] -pub fn with_timezone_to_naive_datetime( +pub fn with_timezone_to_naive_datetime( ts: i64, tz: &Option, + mode: IntervalMode, ) -> Result { - let datetime = if INTERVAL_MODE == MILLISECOND_MODE { + let datetime = if let IntervalMode::Milli = mode { ticks_to_naive_datetime::<1_000_000>(ts) } else { ticks_to_naive_datetime::<1>(ts) diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 95b85de5993b..3de61c16bebc 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -153,7 +153,7 @@ impl PhysicalExpr for DateTimeIntervalExpr { } (ColumnarValue::Array(array_lhs), ColumnarValue::Array(array_rhs)) => { - evaluate_temporal_arrays(&array_lhs, sign, &array_rhs) + evaluate_arrays(&array_lhs, sign, &array_rhs) } (_, _) => { let msg = "If RHS of the operation is an array, then LHS also must be"; @@ -257,15 +257,17 @@ macro_rules! ts_sub_op { prim_array_rhs, |ts1, ts2| { $op( - with_timezone_to_naive_datetime::<$mode>( + with_timezone_to_naive_datetime( ts1.mul_wrapping($coef), &$lhs_tz, + $mode, ) .expect("{ts1} timestamp cannot build a DateTime object") .timestamp(), - with_timezone_to_naive_datetime::<$mode>( + with_timezone_to_naive_datetime( ts2.mul_wrapping($coef), &$rhs_tz, + $mode, ) .expect("{ts2} timestamp cannot build a DateTime object") .timestamp(), @@ -318,11 +320,7 @@ macro_rules! ts_interval_op { ret }}; } -// This function evaluates temporal array operations, such as timestamp - timestamp, interval + interval, -// timestamp + interval, and interval + timestamp. It takes two arrays as input and an integer sign representing -// the operation (+1 for addition and -1 for subtraction). It returns a ColumnarValue as output, which can hold -// either a scalar or an array. 
-pub fn evaluate_temporal_arrays( +pub fn evaluate_arrays( array_lhs: &ArrayRef, sign: i32, array_rhs: &ArrayRef, @@ -367,7 +365,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_second_array, seconds_sub, - MILLISECOND_MODE, + IntervalMode::Milli, TimestampSecondType, IntervalDayTimeType )), @@ -382,7 +380,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_millisecond_array, milliseconds_sub, - MILLISECOND_MODE, + IntervalMode::Milli, TimestampMillisecondType, IntervalDayTimeType )), @@ -397,7 +395,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_microsecond_array, microseconds_sub, - NANOSECOND_MODE, + IntervalMode::Nano, TimestampMicrosecondType, IntervalMonthDayNanoType )), @@ -412,7 +410,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_nanosecond_array, nanoseconds_sub, - NANOSECOND_MODE, + IntervalMode::Nano, TimestampNanosecondType, IntervalMonthDayNanoType )), From 25d76f34a3f08034dc386eb2331e2e39940d9986 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Fri, 24 Mar 2023 14:23:35 +0300 Subject: [PATCH 25/55] ts op interval with chrono functions --- datafusion/common/src/scalar.rs | 109 ++++++++++++++++-- datafusion/core/tests/sql/timestamp.rs | 20 ++-- .../physical-expr/src/expressions/datetime.rs | 79 +++++++------ 3 files changed, 151 insertions(+), 57 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 7b845257efb1..2db38405ef15 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -644,7 +644,7 @@ macro_rules! impl_op { ts_rhs.checked_mul(1_000).ok_or_else(err)?, &tz_lhs, &tz_rhs, - IntervalMode::Milli, + TimestampMode::Milli, ) }, ( @@ -655,7 +655,7 @@ macro_rules! 
impl_op { *ts_rhs, tz_lhs, tz_rhs, - IntervalMode::Milli, + TimestampMode::Milli, ), ( ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), @@ -671,7 +671,7 @@ macro_rules! impl_op { ts_rhs.checked_mul(1_000).ok_or_else(err)?, tz_lhs, tz_rhs, - IntervalMode::Nano, + TimestampMode::Nano, ) }, ( @@ -682,7 +682,7 @@ macro_rules! impl_op { *ts_rhs, tz_lhs, tz_rhs, - IntervalMode::Nano, + TimestampMode::Nano, ), _ => impl_op_arithmetic!($LHS, $RHS, -) } @@ -966,11 +966,18 @@ macro_rules! get_sign { } #[derive(Clone, Copy)] -pub enum IntervalMode { +pub enum TimestampMode { Milli, Nano, } +#[derive(Clone, Copy)] +pub enum IntervalMode { + YM, + DT, + MDN, +} + /// This function computes subtracts `rhs_ts` from `lhs_ts`, taking timezones /// into account when given. Units of the resulting interval is specified by /// the argument `mode`. @@ -984,21 +991,21 @@ fn ts_sub_to_interval( rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, - mode: IntervalMode, + mode: TimestampMode, ) -> Result { let lhs_dt = with_timezone_to_naive_datetime(lhs_ts, lhs_tz, mode)?; let rhs_dt = with_timezone_to_naive_datetime(rhs_ts, rhs_tz, mode)?; let delta_secs = lhs_dt.signed_duration_since(rhs_dt); match mode { - IntervalMode::Milli => { + TimestampMode::Milli => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::new_interval_dt( (as_millisecs / MILLISECS_IN_ONE_DAY) as i32, (as_millisecs % MILLISECS_IN_ONE_DAY) as i32, )) } - IntervalMode::Nano => { + TimestampMode::Nano => { let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( "Can not compute timestamp differences with nanosecond precision", @@ -1019,9 +1026,9 @@ fn ts_sub_to_interval( pub fn with_timezone_to_naive_datetime( ts: i64, tz: &Option, - mode: IntervalMode, + mode: TimestampMode, ) -> Result { - let datetime = if let IntervalMode::Milli = mode { + let datetime = if let TimestampMode::Milli = mode { ticks_to_naive_datetime::<1_000_000>(ts) } else { 
ticks_to_naive_datetime::<1>(ts) @@ -1078,6 +1085,17 @@ pub fn seconds_add(ts_s: i64, scalar: &ScalarValue, sign: i32) -> Result { do_date_time_math(ts_s, 0, scalar, sign).map(|dt| dt.timestamp()) } +#[inline] +pub fn seconds_add_array( + ts_s: i64, + interval: i128, + sign: i32, + interval_mode: IntervalMode, +) -> Result { + do_date_time_math_array(ts_s, 0, interval, sign, interval_mode) + .map(|dt| dt.timestamp()) +} + #[inline] pub fn milliseconds_add(ts_ms: i64, scalar: &ScalarValue, sign: i32) -> Result { let secs = ts_ms / 1000; @@ -1085,6 +1103,19 @@ pub fn milliseconds_add(ts_ms: i64, scalar: &ScalarValue, sign: i32) -> Result Result { + let secs = ts_ms / 1000; + let nsecs = ((ts_ms % 1000) * 1_000_000) as u32; + do_date_time_math_array(secs, nsecs, interval, sign, interval_mode) + .map(|dt| dt.timestamp_millis()) +} + #[inline] pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result { let secs = ts_us / 1_000_000; @@ -1092,6 +1123,19 @@ pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result Result { + let secs = ts_us / 1_000_000; + let nsecs = ((ts_us % 1_000_000) * 1000) as u32; + do_date_time_math_array(secs, nsecs, interval, sign, interval_mode) + .map(|dt| dt.timestamp_nanos() / 1000) +} + #[inline] pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result { let secs = ts_ns / 1_000_000_000; @@ -1099,6 +1143,19 @@ pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result Result { + let secs = ts_ns / 1_000_000_000; + let nsecs = (ts_ns % 1_000_000_000) as u32; + do_date_time_math_array(secs, nsecs, interval, sign, interval_mode) + .map(|dt| dt.timestamp_nanos()) +} + #[inline] pub fn seconds_sub(ts_lhs: i64, ts_rhs: i64) -> i64 { let diff_ms = (ts_lhs - ts_rhs) * 1000; @@ -1143,6 +1200,22 @@ fn do_date_time_math( do_date_math(prior, scalar, sign) } +#[inline] +fn do_date_time_math_array( + secs: i64, + nsecs: u32, + interval: i128, + sign: i32, + interval_mode: 
IntervalMode, +) -> Result { + let prior = NaiveDateTime::from_timestamp_opt(secs, nsecs).ok_or_else(|| { + DataFusionError::Internal(format!( + "Could not conert to NaiveDateTime: secs {secs} nsecs {nsecs}" + )) + })?; + do_date_math_array(prior, interval, sign, interval_mode) +} + fn do_date_math(prior: D, scalar: &ScalarValue, sign: i32) -> Result where D: Datelike + Add, @@ -1157,6 +1230,22 @@ where }) } +fn do_date_math_array( + prior: D, + interval: i128, + sign: i32, + interval_mode: IntervalMode, +) -> Result +where + D: Datelike + Add, +{ + Ok(match interval_mode { + IntervalMode::DT => add_day_time(prior, interval as i64, sign), + IntervalMode::YM => shift_months(prior, interval as i32 * sign), + IntervalMode::MDN => add_m_d_nano(prior, interval, sign), + }) +} + // Can remove once chrono:0.4.23 is released fn add_m_d_nano(prior: D, interval: i128, sign: i32) -> D where diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index 364c7da5945e..6e0f21b1a7c9 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -1825,9 +1825,9 @@ async fn ts_interval_sub() -> Result<()> { "+-----+---------------------+", "| val | ts_interval_diff |", "+-----+---------------------+", - "| 3 | 2021-12-12T03:23:40 |", - "| 2 | 2022-11-11T09:20:20 |", - "| 1 | 2022-04-19T15:17:00 |", + "| 3 | 2021-12-02T03:23:40 |", + "| 2 | 2022-11-09T09:20:20 |", + "| 1 | 2022-04-15T15:17:00 |", "+-----+---------------------+", ]; assert_batches_eq!(expected, &actual); @@ -1839,8 +1839,8 @@ async fn ts_interval_sub() -> Result<()> { "| val | ts_interval_diff |", "+-----+---------------------+", "| 3 | 2023-11-02T03:23:39 |", - "| 2 | 2023-07-09T09:20:20 |", - "| 1 | 2023-03-15T15:17:00 |", + "| 2 | 2023-07-09T09:20:19 |", + "| 1 | 2023-03-15T15:16:59 |", "+-----+---------------------+", ]; assert_batches_eq!(expected, &actual); @@ -1886,9 +1886,9 @@ async fn interval_ts_add() -> Result<()> { 
"+-----+---------------------+", "| val | interval_sum_ts |", "+-----+---------------------+", - "| 3 | 2024-03-31T03:23:40 |", - "| 2 | 2024-02-04T09:20:20 |", - "| 1 | 2023-06-13T15:17:00 |", + "| 3 | 2024-04-02T03:23:40 |", + "| 2 | 2024-02-09T09:20:20 |", + "| 1 | 2023-06-15T15:17:00 |", "+-----+---------------------+", ]; assert_batches_eq!(expected, &actual); @@ -1913,8 +1913,8 @@ async fn interval_ts_add() -> Result<()> { "| val | interval_sum_ts |", "+-----+-------------------------------+", "| 3 | 2023-11-03T02:33:40.003300294 |", - "| 2 | 2023-12-16T08:47:00.001500005 |", - "| 1 | 2023-06-27T15:00:20.002265537 |", + "| 2 | 2023-12-19T08:47:00.001500005 |", + "| 1 | 2023-06-29T15:00:20.002265537 |", "+-----+-------------------------------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 3de61c16bebc..aa070e84824b 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -302,20 +302,13 @@ macro_rules! interval_cross_op { }}; } macro_rules! ts_interval_op { - ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ + ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty, $mode:expr) => {{ let prim_array_lhs = $caster1(&$lhs)?; let prim_array_rhs = $caster2(&$rhs)?; let ret = Arc::new(binary::<$type_in1, $type_in2, _, $type_in1>( prim_array_lhs, prim_array_rhs, - |ts, interval| { - ts.add_wrapping( - $sign as i64 - * $op(&Some(interval)) - .expect("Interval cannot computed as nanosecond") - as i64, - ) - }, + |ts, interval| $op(ts, interval as i128, $sign, $mode).unwrap(), )?) 
as ArrayRef; ret }}; @@ -365,7 +358,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_second_array, seconds_sub, - IntervalMode::Milli, + TimestampMode::Milli, TimestampSecondType, IntervalDayTimeType )), @@ -380,7 +373,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_millisecond_array, milliseconds_sub, - IntervalMode::Milli, + TimestampMode::Milli, TimestampMillisecondType, IntervalDayTimeType )), @@ -395,7 +388,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_microsecond_array, microseconds_sub, - IntervalMode::Nano, + TimestampMode::Nano, TimestampMicrosecondType, IntervalMonthDayNanoType )), @@ -410,7 +403,7 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_nanosecond_array, nanoseconds_sub, - IntervalMode::Nano, + TimestampMode::Nano, TimestampNanosecondType, IntervalMonthDayNanoType )), @@ -568,10 +561,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_second_array, as_interval_ym_array, - ym_to_sec, + seconds_add_array, sign, TimestampSecondType, - IntervalYearMonthType + IntervalYearMonthType, + IntervalMode::YM )), ( DataType::Timestamp(TimeUnit::Second, _), @@ -581,10 +575,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_second_array, as_interval_dt_array, - dt_to_sec, + seconds_add_array, sign, TimestampSecondType, - IntervalDayTimeType + IntervalDayTimeType, + IntervalMode::DT )), ( DataType::Timestamp(TimeUnit::Second, _), @@ -594,10 +589,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_second_array, as_interval_mdn_array, - mdn_to_sec, + seconds_add_array, sign, TimestampSecondType, - IntervalMonthDayNanoType + IntervalMonthDayNanoType, + IntervalMode::MDN )), ( DataType::Timestamp(TimeUnit::Millisecond, _), @@ -607,10 +603,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_millisecond_array, as_interval_ym_array, - ym_to_milli, + 
milliseconds_add_array, sign, TimestampMillisecondType, - IntervalYearMonthType + IntervalYearMonthType, + IntervalMode::YM )), ( DataType::Timestamp(TimeUnit::Millisecond, _), @@ -620,10 +617,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_millisecond_array, as_interval_dt_array, - dt_to_milli, + milliseconds_add_array, sign, TimestampMillisecondType, - IntervalDayTimeType + IntervalDayTimeType, + IntervalMode::DT )), ( DataType::Timestamp(TimeUnit::Millisecond, _), @@ -633,10 +631,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_millisecond_array, as_interval_mdn_array, - mdn_to_milli, + milliseconds_add_array, sign, TimestampMillisecondType, - IntervalMonthDayNanoType + IntervalMonthDayNanoType, + IntervalMode::MDN )), ( DataType::Timestamp(TimeUnit::Microsecond, _), @@ -646,10 +645,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_microsecond_array, as_interval_ym_array, - ym_to_micro, + microseconds_add_array, sign, TimestampMicrosecondType, - IntervalYearMonthType + IntervalYearMonthType, + IntervalMode::YM )), ( DataType::Timestamp(TimeUnit::Microsecond, _), @@ -659,10 +659,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_microsecond_array, as_interval_dt_array, - dt_to_micro, + microseconds_add_array, sign, TimestampMicrosecondType, - IntervalDayTimeType + IntervalDayTimeType, + IntervalMode::DT )), ( DataType::Timestamp(TimeUnit::Microsecond, _), @@ -672,10 +673,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_microsecond_array, as_interval_mdn_array, - mdn_to_micro, + microseconds_add_array, sign, TimestampMicrosecondType, - IntervalMonthDayNanoType + IntervalMonthDayNanoType, + IntervalMode::MDN )), ( DataType::Timestamp(TimeUnit::Nanosecond, _), @@ -685,10 +687,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_nanosecond_array, as_interval_ym_array, - ym_to_nano, + nanoseconds_add_array, sign, TimestampNanosecondType, - IntervalYearMonthType + IntervalYearMonthType, + IntervalMode::YM )), ( 
DataType::Timestamp(TimeUnit::Nanosecond, _), @@ -698,10 +701,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_nanosecond_array, as_interval_dt_array, - dt_to_nano, + nanoseconds_add_array, sign, TimestampNanosecondType, - IntervalDayTimeType + IntervalDayTimeType, + IntervalMode::DT )), ( DataType::Timestamp(TimeUnit::Nanosecond, _), @@ -711,10 +715,11 @@ fn ts_interval_array_op( array_rhs, as_timestamp_nanosecond_array, as_interval_mdn_array, - mdn_to_nano, + nanoseconds_add_array, sign, TimestampNanosecondType, - IntervalMonthDayNanoType + IntervalMonthDayNanoType, + IntervalMode::MDN )), (_, _) => Err(DataFusionError::Execution(format!( "Invalid array types for Timestamp Interval operation: {:?} {} {:?}", From 9de78758cd94ce9b9ffd98bf6e7bdb01e4ca8ac8 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Sun, 26 Mar 2023 13:31:59 +0300 Subject: [PATCH 26/55] bug fix and refactor --- datafusion/common/src/scalar.rs | 40 +++++++-------- datafusion/core/tests/sql/timestamp.rs | 18 ++++++- .../sqllogictests/test_files/timestamps.slt | 26 +++++++++- .../physical-expr/src/expressions/datetime.rs | 51 +++++++++++-------- 4 files changed, 90 insertions(+), 45 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 2db38405ef15..f260bf0702d3 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -639,23 +639,21 @@ macro_rules! 
impl_op { "Overflow while converting seconds to milliseconds".to_string(), ) }; - ts_sub_to_interval( + ts_sub_to_interval::( ts_lhs.checked_mul(1_000).ok_or_else(err)?, ts_rhs.checked_mul(1_000).ok_or_else(err)?, &tz_lhs, &tz_rhs, - TimestampMode::Milli, ) }, ( ScalarValue::TimestampMillisecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampMillisecond(Some(ts_rhs), tz_rhs), - ) => ts_sub_to_interval( + ) => ts_sub_to_interval::( *ts_lhs, *ts_rhs, tz_lhs, tz_rhs, - TimestampMode::Milli, ), ( ScalarValue::TimestampMicrosecond(Some(ts_lhs), tz_lhs), @@ -666,23 +664,21 @@ macro_rules! impl_op { "Overflow while converting microseconds to nanoseconds".to_string(), ) }; - ts_sub_to_interval( + ts_sub_to_interval::( ts_lhs.checked_mul(1_000).ok_or_else(err)?, ts_rhs.checked_mul(1_000).ok_or_else(err)?, tz_lhs, tz_rhs, - TimestampMode::Nano, ) }, ( ScalarValue::TimestampNanosecond(Some(ts_lhs), tz_lhs), ScalarValue::TimestampNanosecond(Some(ts_rhs), tz_rhs), - ) => ts_sub_to_interval( + ) => ts_sub_to_interval::( *ts_lhs, *ts_rhs, tz_lhs, tz_rhs, - TimestampMode::Nano, ), _ => impl_op_arithmetic!($LHS, $RHS, -) } @@ -978,34 +974,35 @@ pub enum IntervalMode { MDN, } +pub const MILLISECOND_MODE: bool = false; +pub const NANOSECOND_MODE: bool = true; /// This function computes subtracts `rhs_ts` from `lhs_ts`, taking timezones /// into account when given. Units of the resulting interval is specified by -/// the argument `mode`. +/// the constant `INTERVAL_MODE`. /// The default behavior of Datafusion is the following: /// - When subtracting timestamps at seconds/milliseconds precision, the output /// interval will have the type [`IntervalDayTimeType`]. /// - When subtracting timestamps at microseconds/nanoseconds precision, the /// output interval will have the type [`IntervalMonthDayNanoType`]. 
-fn ts_sub_to_interval( +fn ts_sub_to_interval( lhs_ts: i64, rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, - mode: TimestampMode, ) -> Result { - let lhs_dt = with_timezone_to_naive_datetime(lhs_ts, lhs_tz, mode)?; - let rhs_dt = with_timezone_to_naive_datetime(rhs_ts, rhs_tz, mode)?; + let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, lhs_tz)?; + let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, rhs_tz)?; let delta_secs = lhs_dt.signed_duration_since(rhs_dt); - match mode { - TimestampMode::Milli => { + match INTERVAL_MODE { + MILLISECOND_MODE => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::new_interval_dt( (as_millisecs / MILLISECS_IN_ONE_DAY) as i32, (as_millisecs % MILLISECS_IN_ONE_DAY) as i32, )) } - TimestampMode::Nano => { + NANOSECOND_MODE => { let as_nanosecs = delta_secs.num_nanoseconds().ok_or_else(|| { DataFusionError::Execution(String::from( "Can not compute timestamp differences with nanosecond precision", @@ -1023,12 +1020,11 @@ fn ts_sub_to_interval( /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp using the units (tick size) implied by argument `mode`. #[inline] -pub fn with_timezone_to_naive_datetime( +pub fn with_timezone_to_naive_datetime( ts: i64, tz: &Option, - mode: TimestampMode, ) -> Result { - let datetime = if let TimestampMode::Milli = mode { + let datetime = if INTERVAL_MODE == MILLISECOND_MODE { ticks_to_naive_datetime::<1_000_000>(ts) } else { ticks_to_naive_datetime::<1>(ts) @@ -1054,10 +1050,10 @@ pub fn with_timezone_to_naive_datetime( /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp, whose tick size is specified by `UNIT_NANOS`. 
#[inline] -fn ticks_to_naive_datetime(ticks: i64) -> Result { +fn ticks_to_naive_datetime(ticks: i64) -> Result { NaiveDateTime::from_timestamp_opt( - (ticks * UNIT_NANOS) / 1_000_000_000, - ((ticks * UNIT_NANOS) % 1_000_000_000) as u32, + ((ticks as i128 * UNIT_NANOS) / 1_000_000_000) as i64, + ((ticks as i128 * UNIT_NANOS) % 1_000_000_000) as u32, ) .ok_or_else(|| { DataFusionError::Execution( diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index 6e0f21b1a7c9..d3538480039c 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -1711,13 +1711,29 @@ async fn timestamp_sub_simple() -> Result<()> { ]; assert_batches_eq!(expected, &actual); + let table_b = make_timestamp_sub_table::()?; + ctx.register_table("table_b", table_b)?; + + let sql = "SELECT val, ts1 - ts2 AS ts_diff FROM table_b ORDER BY ts2 - ts1"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-----+--------------------------------------------------------+", + "| val | ts_diff |", + "+-----+--------------------------------------------------------+", + "| 3 | 0 years 0 mons 0 days 0 hours 0 mins 30.000000000 secs |", + "| 1 | 0 years 0 mons 0 days 0 hours 0 mins 20.000000000 secs |", + "| 2 | 0 years 0 mons 0 days 0 hours 0 mins 10.000000000 secs |", + "+-----+--------------------------------------------------------+", + ]; + assert_batches_eq!(expected, &actual); + Ok(()) } #[tokio::test] async fn timestamp_sub_with_tz() -> Result<()> { let ctx = SessionContext::new(); - let table_a = make_timestamp_tz_sub_table::( + let table_a = make_timestamp_tz_sub_table::( Some("America/Los_Angeles".to_string()), Some("Europe/Istanbul".to_string()), )?; diff --git a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt index 7ca513b99bad..25af7b328adf 100644 --- a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt 
+++ b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt @@ -237,4 +237,28 @@ SELECT INTERVAL '8' YEAR + '2000-01-01T00:00:00'::timestamp; query P SELECT INTERVAL '8' MONTH + '2000-01-01T00:00:00'::timestamp; ---- -2000-09-01T00:00:00 \ No newline at end of file +2000-09-01T00:00:00 + +statement ok +create table foo (val int, ts1 timestamp, ts2 timestamp) as values (1, '2023-03-15T15:00:20'::timestamp, '2023-03-15T15:00:00'::timestamp), (2, '2023-03-15T15:00:10'::timestamp, '2023-03-15T15:00:00'::timestamp); + +query I? +SELECT val, ts1 - ts2 FROM foo ORDER BY ts2 - ts1; +---- +1 0 years 0 mons 0 days 0 hours 0 mins 20.000000000 secs +2 0 years 0 mons 0 days 0 hours 0 mins 10.000000000 secs + +statement ok +drop table foo; + +statement ok +create table foo (val int, ts1 timestamp, ts2 timestamp) as values (1, '2023-03-15T15:00:20.000000123'::timestamp, '2023-03-15T15:00:00.000000099'::timestamp), (2, '2023-03-15T15:00:10.000123456'::timestamp, '2023-03-15T15:00:00.000000001'::timestamp); + +query I? 
+SELECT val, ts1 - ts2 FROM foo ORDER BY ts1 - ts2; +---- +2 0 years 0 mons 0 days 0 hours 0 mins 10.000123455 secs +1 0 years 0 mons 0 days 0 hours 0 mins 20.000000024 secs + +statement ok +drop table foo; \ No newline at end of file diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index aa070e84824b..a2ca8dc8ddc2 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -27,6 +27,7 @@ use arrow::datatypes::{ }; use arrow::record_batch::RecordBatch; use arrow_schema::IntervalUnit; +use chrono::NaiveDateTime; use datafusion_common::cast::*; use datafusion_common::scalar::*; use datafusion_common::Result; @@ -153,7 +154,7 @@ impl PhysicalExpr for DateTimeIntervalExpr { } (ColumnarValue::Array(array_lhs), ColumnarValue::Array(array_rhs)) => { - evaluate_arrays(&array_lhs, sign, &array_rhs) + evaluate_temporal_arrays(&array_lhs, sign, &array_rhs) } (_, _) => { let msg = "If RHS of the operation is an array, then LHS also must be"; @@ -249,7 +250,7 @@ pub fn evaluate_array( } macro_rules! ts_sub_op { - ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $mode:expr, $type_in:ty, $type_out:ty) => {{ + ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $ts_unit:expr, $mode:expr, $type_in:ty, $type_out:ty) => {{ let prim_array_lhs = $caster(&$lhs)?; let prim_array_rhs = $caster(&$rhs)?; let ret = Arc::new(binary::<$type_in, $type_in, _, $type_out>( @@ -257,20 +258,20 @@ macro_rules! 
ts_sub_op { prim_array_rhs, |ts1, ts2| { $op( - with_timezone_to_naive_datetime( - ts1.mul_wrapping($coef), - &$lhs_tz, - $mode, - ) - .expect("{ts1} timestamp cannot build a DateTime object") - .timestamp(), - with_timezone_to_naive_datetime( - ts2.mul_wrapping($coef), - &$rhs_tz, - $mode, - ) - .expect("{ts2} timestamp cannot build a DateTime object") - .timestamp(), + $ts_unit( + &with_timezone_to_naive_datetime::<$mode>( + ts1.mul_wrapping($coef), + &$lhs_tz, + ) + .expect("{ts1} timestamp cannot build a DateTime object"), + ), + $ts_unit( + &with_timezone_to_naive_datetime::<$mode>( + ts2.mul_wrapping($coef), + &$rhs_tz, + ) + .expect("{ts2} timestamp cannot build a DateTime object"), + ), ) }, )?) as ArrayRef; @@ -313,7 +314,11 @@ macro_rules! ts_interval_op { ret }}; } -pub fn evaluate_arrays( +// This function evaluates temporal array operations, such as timestamp - timestamp, interval + interval, +// timestamp + interval, and interval + timestamp. It takes two arrays as input and an integer sign representing +// the operation (+1 for addition and -1 for subtraction). It returns a ColumnarValue as output, which can hold +// either a scalar or an array. 
+pub fn evaluate_temporal_arrays( array_lhs: &ArrayRef, sign: i32, array_rhs: &ArrayRef, @@ -358,7 +363,8 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_second_array, seconds_sub, - TimestampMode::Milli, + NaiveDateTime::timestamp, + MILLISECOND_MODE, TimestampSecondType, IntervalDayTimeType )), @@ -373,7 +379,8 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_millisecond_array, milliseconds_sub, - TimestampMode::Milli, + NaiveDateTime::timestamp_millis, + MILLISECOND_MODE, TimestampMillisecondType, IntervalDayTimeType )), @@ -388,7 +395,8 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1000i64, as_timestamp_microsecond_array, microseconds_sub, - TimestampMode::Nano, + NaiveDateTime::timestamp_micros, + NANOSECOND_MODE, TimestampMicrosecondType, IntervalMonthDayNanoType )), @@ -403,7 +411,8 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { 1i64, as_timestamp_nanosecond_array, nanoseconds_sub, - TimestampMode::Nano, + NaiveDateTime::timestamp_nanos, + NANOSECOND_MODE, TimestampNanosecondType, IntervalMonthDayNanoType )), From d637efeedd6975e4cd440c512badc522c161fe38 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 27 Mar 2023 16:24:52 +0300 Subject: [PATCH 27/55] test refactor --- datafusion/common/src/scalar.rs | 36 +++++- datafusion/core/tests/sql/mod.rs | 7 -- datafusion/core/tests/sql/timestamp.rs | 158 ++++--------------------- 3 files changed, 55 insertions(+), 146 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index f260bf0702d3..0e7eeb8ea646 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -1051,11 +1051,13 @@ pub fn with_timezone_to_naive_datetime( /// given timestamp, whose tick size is specified by `UNIT_NANOS`. 
#[inline] fn ticks_to_naive_datetime(ticks: i64) -> Result { - NaiveDateTime::from_timestamp_opt( - ((ticks as i128 * UNIT_NANOS) / 1_000_000_000) as i64, - ((ticks as i128 * UNIT_NANOS) % 1_000_000_000) as u32, - ) - .ok_or_else(|| { + let mut secs: i64 = ((ticks as i128 * UNIT_NANOS) / 1_000_000_000) as i64; + let mut nsecs: i32 = ((ticks as i128 * UNIT_NANOS) % 1_000_000_000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + NaiveDateTime::from_timestamp_opt(secs, nsecs as u32).ok_or_else(|| { DataFusionError::Execution( "Can not convert given timestamp to a NaiveDateTime".to_string(), ) @@ -5550,6 +5552,30 @@ mod tests { ), ScalarValue::new_interval_dt(0, 0), ), + // 11th test case, negative results + ( + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 17) + .unwrap() + .and_hms_milli_opt(4, 10, 0, 0) + .unwrap() + .timestamp_millis(), + ), + None, + ), + ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 3, 17) + .unwrap() + .and_hms_milli_opt(4, 10, 0, 1) + .unwrap() + .timestamp_millis(), + ), + None, + ), + ScalarValue::new_interval_dt(0, -sign), + ), ] } diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index f70a06adb38b..f2c366007840 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -1359,13 +1359,6 @@ where Ok(Arc::new(table)) } -fn make_timestamp_sub_table() -> Result> -where - A: ArrowTimestampType, -{ - make_timestamp_tz_sub_table::(None, None) -} - fn make_timestamp_tz_sub_table( tz1: Option, tz2: Option, diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index d3538480039c..d87f3d480261 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -1692,44 +1692,6 @@ async fn test_ts_dt_binary_ops() -> Result<()> { Ok(()) } -#[tokio::test] -async fn timestamp_sub_simple() -> Result<()> { - let ctx = SessionContext::new(); - let 
table_a = make_timestamp_sub_table::()?; - ctx.register_table("table_a", table_a)?; - - let sql = "SELECT val, ts1 - ts2 AS ts_diff FROM table_a ORDER BY ts2 - ts1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+--------------------------------------------------+", - "| val | ts_diff |", - "+-----+--------------------------------------------------+", - "| 3 | 0 years 0 mons 0 days 0 hours 0 mins 30.000 secs |", - "| 1 | 0 years 0 mons 0 days 0 hours 0 mins 20.000 secs |", - "| 2 | 0 years 0 mons 0 days 0 hours 0 mins 10.000 secs |", - "+-----+--------------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let table_b = make_timestamp_sub_table::()?; - ctx.register_table("table_b", table_b)?; - - let sql = "SELECT val, ts1 - ts2 AS ts_diff FROM table_b ORDER BY ts2 - ts1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+--------------------------------------------------------+", - "| val | ts_diff |", - "+-----+--------------------------------------------------------+", - "| 3 | 0 years 0 mons 0 days 0 hours 0 mins 30.000000000 secs |", - "| 1 | 0 years 0 mons 0 days 0 hours 0 mins 20.000000000 secs |", - "| 2 | 0 years 0 mons 0 days 0 hours 0 mins 10.000000000 secs |", - "+-----+--------------------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - Ok(()) -} - #[tokio::test] async fn timestamp_sub_with_tz() -> Result<()> { let ctx = SessionContext::new(); @@ -1822,58 +1784,22 @@ async fn ts_interval_sub() -> Result<()> { let table_a = make_ts_interval_table()?; ctx.register_table("table_a", table_a)?; - let sql = "SELECT val, ts_millisec1 - interval_dt1 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-------------------------+", - "| val | ts_interval_diff |", - "+-----+-------------------------+", - 
"| 3 | 2023-10-28T15:13:38.002 |", - "| 2 | 2023-07-06T01:13:37.999 |", - "| 1 | 2023-03-13T11:13:37.999 |", - "+-----+-------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, ts_sec1 - interval_ym1 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+---------------------+", - "| val | ts_interval_diff |", - "+-----+---------------------+", - "| 3 | 2021-12-02T03:23:40 |", - "| 2 | 2022-11-09T09:20:20 |", - "| 1 | 2022-04-15T15:17:00 |", - "+-----+---------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, ts_sec1 - interval_mdn2 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; + let sql = "SELECT val, ts_millisec1 - interval_dt1 AS ts_interval_diff, + ts_sec1 - interval_ym1 AS ts_interval_diff2, + ts_sec1 - interval_mdn2 AS ts_interval_diff3, + ts_nanosec1 - interval_dt2 AS ts_interval_diff4 + FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+-----+---------------------+", - "| val | ts_interval_diff |", - "+-----+---------------------+", - "| 3 | 2023-11-02T03:23:39 |", - "| 2 | 2023-07-09T09:20:19 |", - "| 1 | 2023-03-15T15:16:59 |", - "+-----+---------------------+", + "+-----+-------------------------+---------------------+---------------------+-------------------------------+", + "| val | ts_interval_diff | ts_interval_diff2 | ts_interval_diff3 | ts_interval_diff4 |", + "+-----+-------------------------+---------------------+---------------------+-------------------------------+", + "| 3 | 2023-10-28T15:13:38.002 | 2021-12-02T03:23:40 | 2023-11-02T03:23:39 | 2023-11-01T02:33:38.003300003 |", + "| 2 | 2023-07-06T01:13:37.999 | 2022-11-09T09:20:20 | 2023-07-09T09:20:19 | 2023-07-08T08:46:58.001500004 |", + "| 1 | 2023-03-13T11:13:37.999 | 2022-04-15T15:17:00 | 
2023-03-15T15:16:59 | 2023-03-14T15:00:18.002200002 |", + "+-----+-------------------------+---------------------+---------------------+-------------------------------+", ]; assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, ts_nanosec1 - interval_dt2 AS ts_interval_diff FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-------------------------------+", - "| val | ts_interval_diff |", - "+-----+-------------------------------+", - "| 3 | 2023-11-01T02:33:38.003300003 |", - "| 2 | 2023-07-08T08:46:58.001500004 |", - "| 1 | 2023-03-14T15:00:18.002200002 |", - "+-----+-------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) } @@ -1883,57 +1809,21 @@ async fn interval_ts_add() -> Result<()> { let table_a = make_ts_interval_table()?; ctx.register_table("table_a", table_a)?; - let sql = "SELECT val, interval_dt1 + ts_millisec2 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-------------------------+", - "| val | interval_sum_ts |", - "+-----+-------------------------+", - "| 3 | 2023-11-06T13:53:42.001 |", - "| 2 | 2023-07-12T16:20:22.002 |", - "| 1 | 2023-03-17T18:47:02.003 |", - "+-----+-------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, interval_ym2 + ts_sec1 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+---------------------+", - "| val | interval_sum_ts |", - "+-----+---------------------+", - "| 3 | 2024-04-02T03:23:40 |", - "| 2 | 2024-02-09T09:20:20 |", - "| 1 | 2023-06-15T15:17:00 |", - "+-----+---------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, interval_mdn2 + ts_sec2 AS interval_sum_ts FROM table_a 
ORDER BY interval_dt1 + ts_millisec2 DESC"; + let sql = "SELECT val, interval_dt1 + ts_millisec2 AS interval_sum_ts, + interval_ym2 + ts_sec1 AS interval_sum_ts2, + interval_mdn2 + ts_sec2 AS interval_sum_ts3, + interval_mdn1 + ts_nanosec1 AS interval_sum_ts4 + FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+-----+---------------------+", - "| val | interval_sum_ts |", - "+-----+---------------------+", - "| 3 | 2023-11-02T02:33:41 |", - "| 2 | 2023-07-09T08:47:00 |", - "| 1 | 2023-03-15T15:00:20 |", - "+-----+---------------------+", + "+-----+-------------------------+---------------------+---------------------+-------------------------------+", + "| val | interval_sum_ts | interval_sum_ts2 | interval_sum_ts3 | interval_sum_ts4 |", + "+-----+-------------------------+---------------------+---------------------+-------------------------------+", + "| 3 | 2023-11-06T13:53:42.001 | 2024-04-02T03:23:40 | 2023-11-02T02:33:41 | 2023-11-03T02:33:40.003300294 |", + "| 2 | 2023-07-12T16:20:22.002 | 2024-02-09T09:20:20 | 2023-07-09T08:47:00 | 2023-12-19T08:47:00.001500005 |", + "| 1 | 2023-03-17T18:47:02.003 | 2023-06-15T15:17:00 | 2023-03-15T15:00:20 | 2023-06-29T15:00:20.002265537 |", + "+-----+-------------------------+---------------------+---------------------+-------------------------------+", ]; assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, interval_mdn1 + ts_nanosec1 AS interval_sum_ts FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-------------------------------+", - "| val | interval_sum_ts |", - "+-----+-------------------------------+", - "| 3 | 2023-11-03T02:33:40.003300294 |", - "| 2 | 2023-12-19T08:47:00.001500005 |", - "| 1 | 2023-06-29T15:00:20.002265537 |", - "+-----+-------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) } 
From e2ee0ed76a386a4e85d454dc3aecab9dd406da87 Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Mon, 27 Mar 2023 19:10:46 +0300 Subject: [PATCH 28/55] Enhance commenting --- .../physical-expr/src/expressions/datetime.rs | 13 +++++++++---- datafusion/physical-expr/src/planner.rs | 5 ++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index a2ca8dc8ddc2..314ba2757a0f 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -78,8 +78,8 @@ impl DateTimeIntervalExpr { rhs, input_schema: input_schema.clone(), }), - other => Err(DataFusionError::Execution(format!( - "Invalid operation '{other:?}' for DateIntervalExpr" + (lhs, _, rhs) => Err(DataFusionError::Execution(format!( + "Invalid operation between '{lhs}' and '{rhs}' for DateIntervalExpr" ))), } } @@ -350,6 +350,7 @@ pub fn evaluate_temporal_arrays( Ok(ColumnarValue::Array(ret)) } +/// Performs a timestamp subtraction operation on two arrays and returns the resulting array. fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { match (array_lhs.data_type(), array_rhs.data_type()) { ( @@ -423,7 +424,9 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { ))), } } - +/// Performs an interval operation on two arrays and returns the resulting array. +/// The operation sign determines whether to perform addition or subtraction. +/// The data type and unit of the two input arrays must match the supported combinations. fn interval_array_op( array_lhs: &ArrayRef, array_rhs: &ArrayRef, @@ -555,7 +558,9 @@ fn interval_array_op( ))), } } - +/// Performs a timestamp/interval operation on two arrays and returns the resulting array. +/// The operation sign determines whether to perform addition or subtraction. 
+/// The data type and unit of the two input arrays must match the supported combinations. fn ts_interval_array_op( array_lhs: &ArrayRef, sign: i32, diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index f4f7d6a40239..6568716bb0f5 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -170,6 +170,7 @@ pub fn create_physical_expr( ) } Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + // Create physical expressions for left and right operands let lhs = create_physical_expr( left, input_dfschema, @@ -182,6 +183,9 @@ pub fn create_physical_expr( input_schema, execution_props, )?; + // Match the data types and operator to determine the appropriate expression, if + // they are supported temporal types and operations, create DateTimeIntervalExpr, + // else create BinaryExpr. match ( lhs.data_type(input_schema)?, op, @@ -207,7 +211,6 @@ pub fn create_physical_expr( lhs, input_schema, )?)), - // Timestamp + Timestamp operations cannot reach till that point already. 
( DataType::Timestamp(_, _), Operator::Minus, From 3e03a54c1b8b5faad24a7f7553386bdfb2021bc8 Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Tue, 28 Mar 2023 11:45:27 +0300 Subject: [PATCH 29/55] new binary operation logic, handling the inside errors --- .../physical-expr/src/expressions/datetime.rs | 91 +++++++++++++++---- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 314ba2757a0f..680b3718e3af 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -17,7 +17,7 @@ use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; -use arrow::array::{Array, ArrayRef}; +use arrow::array::{Array, ArrayData, ArrayRef, ArrowPrimitiveType, PrimitiveArray}; use arrow::compute::{binary, unary}; use arrow::datatypes::{ ArrowNativeTypeOp, DataType, Date32Type, Date64Type, IntervalDayTimeType, @@ -26,7 +26,9 @@ use arrow::datatypes::{ TimestampSecondType, }; use arrow::record_batch::RecordBatch; -use arrow_schema::IntervalUnit; +use arrow::util::bit_mask::combine_option_bitmap; +use arrow_buffer::Buffer; +use arrow_schema::{ArrowError, IntervalUnit}; use chrono::NaiveDateTime; use datafusion_common::cast::*; use datafusion_common::scalar::*; @@ -253,26 +255,20 @@ macro_rules! 
ts_sub_op { ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $ts_unit:expr, $mode:expr, $type_in:ty, $type_out:ty) => {{ let prim_array_lhs = $caster(&$lhs)?; let prim_array_rhs = $caster(&$rhs)?; - let ret = Arc::new(binary::<$type_in, $type_in, _, $type_out>( + let ret = Arc::new(try_binary_op::<$type_in, $type_in, _, $type_out>( prim_array_lhs, prim_array_rhs, |ts1, ts2| { - $op( - $ts_unit( - &with_timezone_to_naive_datetime::<$mode>( - ts1.mul_wrapping($coef), - &$lhs_tz, - ) - .expect("{ts1} timestamp cannot build a DateTime object"), - ), - $ts_unit( - &with_timezone_to_naive_datetime::<$mode>( - ts2.mul_wrapping($coef), - &$rhs_tz, - ) - .expect("{ts2} timestamp cannot build a DateTime object"), - ), - ) + Ok($op( + $ts_unit(&with_timezone_to_naive_datetime::<$mode>( + ts1.mul_wrapping($coef), + &$lhs_tz, + )?), + $ts_unit(&with_timezone_to_naive_datetime::<$mode>( + ts2.mul_wrapping($coef), + &$rhs_tz, + )?), + )) }, )?) as ArrayRef; ret @@ -350,6 +346,63 @@ pub fn evaluate_temporal_arrays( Ok(ColumnarValue::Array(ret)) } +#[inline] +unsafe fn build_primitive_array( + len: usize, + buffer: Buffer, + null_count: usize, + null_buffer: Option, +) -> PrimitiveArray { + PrimitiveArray::from(ArrayData::new_unchecked( + O::DATA_TYPE, + len, + Some(null_count), + null_buffer, + 0, + vec![buffer], + vec![], + )) +} + +pub fn try_binary_op( + a: &PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> Result, ArrowError> +where + A: ArrowPrimitiveType, + B: ArrowPrimitiveType, + O: ArrowPrimitiveType, + F: Fn(A::Native, B::Native) -> Result, +{ + if a.len() != b.len() { + return Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + )); + } + let len = a.len(); + + if a.is_empty() { + return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); + } + + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); + let null_count = null_buffer + .as_ref() + 
.map(|x| len - x.count_set_bits_offset(0, len)) + .unwrap_or_default(); + + let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` is an iterator with a known size from a PrimitiveArray + let buffer = unsafe { Buffer::try_from_trusted_len_iter(values) }?; + + Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }) +} + /// Performs a timestamp subtraction operation on two arrays and returns the resulting array. fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { match (array_lhs.data_type(), array_rhs.data_type()) { From 03d3aed6f1803aaacb0a0b8c1809c127845297ce Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 28 Mar 2023 15:12:22 +0300 Subject: [PATCH 30/55] slt and minor changes --- datafusion/common/src/scalar.rs | 70 +++---- datafusion/core/tests/sql/mod.rs | 185 ------------------ datafusion/core/tests/sql/timestamp.rs | 112 +---------- .../sqllogictests/test_files/timestamps.slt | 106 ++++++++-- datafusion/expr/src/type_coercion/binary.rs | 12 +- .../physical-expr/src/expressions/datetime.rs | 67 +++---- 6 files changed, 159 insertions(+), 393 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 0e7eeb8ea646..76cbb982bec8 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -961,40 +961,31 @@ macro_rules! get_sign { }; } -#[derive(Clone, Copy)] -pub enum TimestampMode { - Milli, - Nano, -} - -#[derive(Clone, Copy)] -pub enum IntervalMode { - YM, - DT, - MDN, -} +pub const YM_MODE: i8 = 0; +pub const DT_MODE: i8 = 1; +pub const MDN_MODE: i8 = 2; pub const MILLISECOND_MODE: bool = false; pub const NANOSECOND_MODE: bool = true; /// This function computes subtracts `rhs_ts` from `lhs_ts`, taking timezones /// into account when given. Units of the resulting interval is specified by -/// the constant `INTERVAL_MODE`. +/// the constant `TIME_MODE`. 
/// The default behavior of Datafusion is the following: /// - When subtracting timestamps at seconds/milliseconds precision, the output /// interval will have the type [`IntervalDayTimeType`]. /// - When subtracting timestamps at microseconds/nanoseconds precision, the /// output interval will have the type [`IntervalMonthDayNanoType`]. -fn ts_sub_to_interval( +fn ts_sub_to_interval( lhs_ts: i64, rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, lhs_tz)?; - let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, rhs_tz)?; + let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, lhs_tz)?; + let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, rhs_tz)?; let delta_secs = lhs_dt.signed_duration_since(rhs_dt); - match INTERVAL_MODE { + match TIME_MOD { MILLISECOND_MODE => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::new_interval_dt( @@ -1020,11 +1011,11 @@ fn ts_sub_to_interval( /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp using the units (tick size) implied by argument `mode`. 
#[inline] -pub fn with_timezone_to_naive_datetime( +pub fn with_timezone_to_naive_datetime( ts: i64, tz: &Option, ) -> Result { - let datetime = if INTERVAL_MODE == MILLISECOND_MODE { + let datetime = if TIME_MODE == MILLISECOND_MODE { ticks_to_naive_datetime::<1_000_000>(ts) } else { ticks_to_naive_datetime::<1>(ts) @@ -1084,13 +1075,12 @@ pub fn seconds_add(ts_s: i64, scalar: &ScalarValue, sign: i32) -> Result { } #[inline] -pub fn seconds_add_array( +pub fn seconds_add_array( ts_s: i64, interval: i128, sign: i32, - interval_mode: IntervalMode, ) -> Result { - do_date_time_math_array(ts_s, 0, interval, sign, interval_mode) + do_date_time_math_array::(ts_s, 0, interval, sign) .map(|dt| dt.timestamp()) } @@ -1102,15 +1092,14 @@ pub fn milliseconds_add(ts_ms: i64, scalar: &ScalarValue, sign: i32) -> Result( ts_ms: i64, interval: i128, sign: i32, - interval_mode: IntervalMode, ) -> Result { let secs = ts_ms / 1000; let nsecs = ((ts_ms % 1000) * 1_000_000) as u32; - do_date_time_math_array(secs, nsecs, interval, sign, interval_mode) + do_date_time_math_array::(secs, nsecs, interval, sign) .map(|dt| dt.timestamp_millis()) } @@ -1122,15 +1111,14 @@ pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result( ts_us: i64, interval: i128, sign: i32, - interval_mode: IntervalMode, ) -> Result { let secs = ts_us / 1_000_000; let nsecs = ((ts_us % 1_000_000) * 1000) as u32; - do_date_time_math_array(secs, nsecs, interval, sign, interval_mode) + do_date_time_math_array::(secs, nsecs, interval, sign) .map(|dt| dt.timestamp_nanos() / 1000) } @@ -1142,15 +1130,14 @@ pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result( ts_ns: i64, interval: i128, sign: i32, - interval_mode: IntervalMode, ) -> Result { let secs = ts_ns / 1_000_000_000; let nsecs = (ts_ns % 1_000_000_000) as u32; - do_date_time_math_array(secs, nsecs, interval, sign, interval_mode) + do_date_time_math_array::(secs, nsecs, interval, sign) .map(|dt| dt.timestamp_nanos()) } 
@@ -1199,19 +1186,18 @@ fn do_date_time_math( } #[inline] -fn do_date_time_math_array( +fn do_date_time_math_array( secs: i64, nsecs: u32, interval: i128, sign: i32, - interval_mode: IntervalMode, ) -> Result { let prior = NaiveDateTime::from_timestamp_opt(secs, nsecs).ok_or_else(|| { DataFusionError::Internal(format!( "Could not conert to NaiveDateTime: secs {secs} nsecs {nsecs}" )) })?; - do_date_math_array(prior, interval, sign, interval_mode) + do_date_math_array::<_, INTERVAL_MODE>(prior, interval, sign) } fn do_date_math(prior: D, scalar: &ScalarValue, sign: i32) -> Result @@ -1228,19 +1214,23 @@ where }) } -fn do_date_math_array( +fn do_date_math_array( prior: D, interval: i128, sign: i32, - interval_mode: IntervalMode, ) -> Result where D: Datelike + Add, { - Ok(match interval_mode { - IntervalMode::DT => add_day_time(prior, interval as i64, sign), - IntervalMode::YM => shift_months(prior, interval as i32 * sign), - IntervalMode::MDN => add_m_d_nano(prior, interval, sign), + Ok(match INTERVAL_MODE { + YM_MODE => shift_months(prior, interval as i32 * sign), + DT_MODE => add_day_time(prior, interval as i64, sign), + MDN_MODE => add_m_d_nano(prior, interval, sign), + _ => { + return Err(DataFusionError::Internal( + "Undefined interval mode for interval calculations".to_string(), + )); + } }) } diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index f2c366007840..add401f66c09 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -1415,191 +1415,6 @@ where Ok(Arc::new(table)) } -fn make_ts_interval_table() -> Result> { - let schema = Arc::new(Schema::new(vec![ - Field::new( - "ts_sec1", - DataType::Timestamp(TimeUnit::Second, None), - false, - ), - Field::new( - "ts_sec2", - DataType::Timestamp(TimeUnit::Second, None), - false, - ), - Field::new( - "ts_millisec1", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - Field::new( - "ts_millisec2", - 
DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - Field::new( - "ts_microsec1", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - Field::new( - "ts_microsec2", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - Field::new( - "ts_nanosec1", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - Field::new( - "ts_nanosec2", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - Field::new( - "interval_ym1", - DataType::Interval(IntervalUnit::YearMonth), - false, - ), - Field::new( - "interval_ym2", - DataType::Interval(IntervalUnit::YearMonth), - false, - ), - Field::new( - "interval_dt1", - DataType::Interval(IntervalUnit::DayTime), - false, - ), - Field::new( - "interval_dt2", - DataType::Interval(IntervalUnit::DayTime), - false, - ), - Field::new( - "interval_mdn1", - DataType::Interval(IntervalUnit::MonthDayNano), - false, - ), - Field::new( - "interval_mdn2", - DataType::Interval(IntervalUnit::MonthDayNano), - false, - ), - Field::new("val", DataType::Int32, true), - ])); - - let ts_sec1 = vec![ - 1_678_893_420i64, //2023-03-15T15:17:00 - 1_688_894_420i64, //2023-07-09T09:20:20 - 1_698_895_420i64, //2023-11-02T03:23:40 - ]; - let ts_sec2 = vec![ - 1_678_892_420i64, //2023-03-15T15:00:20 - 1_688_892_420i64, //2023-07-09T08:47:00 - 1_698_892_420i64, //2023-11-02T02:33:40 - ]; - let ts_millisec1 = vec![ - 1_678_892_420_002i64, //2023-03-15T15:00:20.002 - 1_688_892_420_001i64, //2023-07-09T08:47:00.001 - 1_698_892_420_003i64, //2023-11-02T02:33:40.003 - ]; - let ts_millisec2 = vec![ - 1_678_892_420_000i64, //2023-03-15T15:00:20.000 - 1_688_892_420_000i64, //2023-07-09T08:47:00.000 - 1_698_892_420_000i64, //2023-11-02T02:33:40.000 - ]; - let ts_microsec1 = vec![ - 1_678_892_420_002_010i64, //2023-03-15T15:00:20.002_010 - 1_688_892_420_001_020i64, //2023-07-09T08:47:00.001_020 - 1_698_892_420_003_030i64, //2023-11-02T02:33:40.003_030 - ]; - let ts_microsec2 = vec![ - 1_678_892_420_000_000i64, 
//2023-03-15T15:00:20.000_000 - 1_688_892_420_000_000i64, //2023-07-09T08:47:00.000_000 - 1_698_892_420_000_000i64, //2023-11-02T02:33:40.000_000 - ]; - let ts_nanosec1 = vec![ - 1_678_892_420_002_200_002i64, //2023-03-15T15:00:20.002_200_002 - 1_688_892_420_001_500_004i64, //2023-07-09T08:47:00.001_500_004 - 1_698_892_420_003_300_003i64, //2023-11-02T02:33:40.003_300_003 - ]; - let ts_nanosec2 = vec![ - 1_678_892_420_000_000_000i64, //2023-03-15T15:00:20.000_000_000 - 1_688_892_420_000_000_000i64, //2023-07-09T08:47:00.000_000_000 - 1_698_892_420_000_000_000i64, //2023-11-02T02:33:40.000_000_000 - ]; - - let intervals_ym1 = vec![11, 8, 23]; - // 11 months, 8 months, 23 months - let intervals_ym2 = vec![3, 7, 5]; - // 3 months, 7 months, 5 months - let intervals_dt1 = vec![4_394_969_299, 4_494_969_298, 4_594_969_297]; - // 1 day 27 hours 46 minutes 42 seconds 3 milliseconds - // 1 day 55 hours 33 minutes 22 seconds 2 milliseconds - // 1 day 83 hours 20 minutes 2 seconds 1 millisecond - let intervals_dt2 = vec![4_294_969_296, 4_294_969_296, 4_294_969_296]; - // 1 day 2 seconds , 1 day 2 seconds , 1 day 2 seconds - let intervals_mdn1 = vec![ - 237_684_487_801_047_429_812_565_639_167, - 396_140_812_755_789_128_704_815_267_841, - 18_446_744_073_709_551_907, - ]; - // 3 months 14 days 65535 nanoseconds - // 5 months 10 days 1 nanosecond - // 0 month 1 day 291 nanoseconds - let intervals_mdn2 = vec![100, 1_000_000, 1_000_000_000]; - // 100 nanoseconds - // 1_000_000 nanoseconds - // 1_000_000_000 nanoseconds - - let array1 = PrimitiveArray::::from_iter_values(ts_sec1); - let array2 = PrimitiveArray::::from_iter_values(ts_sec2); - let array3 = - PrimitiveArray::::from_iter_values(ts_millisec1); - let array4 = - PrimitiveArray::::from_iter_values(ts_millisec2); - let array5 = - PrimitiveArray::::from_iter_values(ts_microsec1); - let array6 = - PrimitiveArray::::from_iter_values(ts_microsec2); - let array7 = PrimitiveArray::::from_iter_values(ts_nanosec1); - let array8 = 
PrimitiveArray::::from_iter_values(ts_nanosec2); - let array9 = PrimitiveArray::::from_iter_values(intervals_ym1); - let array10 = - PrimitiveArray::::from_iter_values(intervals_ym2); - let array11 = PrimitiveArray::::from_iter_values(intervals_dt1); - let array12 = PrimitiveArray::::from_iter_values(intervals_dt2); - let array13 = - PrimitiveArray::::from_iter_values(intervals_mdn1); - let array14 = - PrimitiveArray::::from_iter_values(intervals_mdn2); - - let data = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(array1), - Arc::new(array2), - Arc::new(array3), - Arc::new(array4), - Arc::new(array5), - Arc::new(array6), - Arc::new(array7), - Arc::new(array8), - Arc::new(array9), - Arc::new(array10), - Arc::new(array11), - Arc::new(array12), - Arc::new(array13), - Arc::new(array14), - Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), - ], - )?; - let table = MemTable::try_new(schema, vec![vec![data]])?; - Ok(Arc::new(table)) -} - fn make_timestamp_nano_table() -> Result> { make_timestamp_table::() } diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index d87f3d480261..e3c2ef6dff65 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -1692,6 +1692,7 @@ async fn test_ts_dt_binary_ops() -> Result<()> { Ok(()) } +// Cannot remove to sqllogictest, timezone support is not ready there. 
#[tokio::test] async fn timestamp_sub_with_tz() -> Result<()> { let ctx = SessionContext::new(); @@ -1716,114 +1717,3 @@ async fn timestamp_sub_with_tz() -> Result<()> { Ok(()) } - -#[tokio::test] -async fn interval_sub() -> Result<()> { - let ctx = SessionContext::new(); - let table_a = make_ts_interval_table()?; - ctx.register_table("table_a", table_a)?; - - let sql = "SELECT val, interval_dt1 - interval_dt2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+----------------------------------------------------+", - "| val | interval_diff |", - "+-----+----------------------------------------------------+", - "| 3 | 0 years 0 mons 0 days 83 hours 20 mins 0.001 secs |", - "| 2 | 0 years 0 mons 0 days 55 hours 33 mins 20.002 secs |", - "| 1 | 0 years 0 mons 0 days 27 hours 46 mins 40.003 secs |", - "+-----+----------------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, interval_ym1 - interval_ym2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+------------------------------------------------+", - "| val | interval_diff |", - "+-----+------------------------------------------------+", - "| 3 | 1 years 6 mons 0 days 0 hours 0 mins 0.00 secs |", - "| 2 | 0 years 1 mons 0 days 0 hours 0 mins 0.00 secs |", - "| 1 | 0 years 8 mons 0 days 0 hours 0 mins 0.00 secs |", - "+-----+------------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, interval_mdn1 - interval_mdn2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+---------------------------------------------------------+", - "| val | interval_diff |", - 
"+-----+---------------------------------------------------------+", - "| 3 | 0 years 0 mons 1 days 0 hours 0 mins -0.999999709 secs |", - "| 2 | 0 years 5 mons 10 days 0 hours 0 mins -0.000999999 secs |", - "| 1 | 0 years 3 mons 14 days 0 hours 0 mins 0.000065435 secs |", - "+-----+---------------------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT val, interval_ym1 - interval_dt2 AS interval_diff FROM table_a ORDER BY interval_dt2 - interval_dt1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+----------------------------------------------------------+", - "| val | interval_diff |", - "+-----+----------------------------------------------------------+", - "| 3 | 0 years 23 mons -1 days 0 hours 0 mins -2.000000000 secs |", - "| 2 | 0 years 8 mons -1 days 0 hours 0 mins -2.000000000 secs |", - "| 1 | 0 years 11 mons -1 days 0 hours 0 mins -2.000000000 secs |", - "+-----+----------------------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - Ok(()) -} - -#[tokio::test] -async fn ts_interval_sub() -> Result<()> { - let ctx = SessionContext::new(); - let table_a = make_ts_interval_table()?; - ctx.register_table("table_a", table_a)?; - - let sql = "SELECT val, ts_millisec1 - interval_dt1 AS ts_interval_diff, - ts_sec1 - interval_ym1 AS ts_interval_diff2, - ts_sec1 - interval_mdn2 AS ts_interval_diff3, - ts_nanosec1 - interval_dt2 AS ts_interval_diff4 - FROM table_a ORDER BY ts_millisec1 - interval_dt1 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-------------------------+---------------------+---------------------+-------------------------------+", - "| val | ts_interval_diff | ts_interval_diff2 | ts_interval_diff3 | ts_interval_diff4 |", - "+-----+-------------------------+---------------------+---------------------+-------------------------------+", - "| 3 | 2023-10-28T15:13:38.002 | 
2021-12-02T03:23:40 | 2023-11-02T03:23:39 | 2023-11-01T02:33:38.003300003 |", - "| 2 | 2023-07-06T01:13:37.999 | 2022-11-09T09:20:20 | 2023-07-09T09:20:19 | 2023-07-08T08:46:58.001500004 |", - "| 1 | 2023-03-13T11:13:37.999 | 2022-04-15T15:17:00 | 2023-03-15T15:16:59 | 2023-03-14T15:00:18.002200002 |", - "+-----+-------------------------+---------------------+---------------------+-------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn interval_ts_add() -> Result<()> { - let ctx = SessionContext::new(); - let table_a = make_ts_interval_table()?; - ctx.register_table("table_a", table_a)?; - - let sql = "SELECT val, interval_dt1 + ts_millisec2 AS interval_sum_ts, - interval_ym2 + ts_sec1 AS interval_sum_ts2, - interval_mdn2 + ts_sec2 AS interval_sum_ts3, - interval_mdn1 + ts_nanosec1 AS interval_sum_ts4 - FROM table_a ORDER BY interval_dt1 + ts_millisec2 DESC"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-------------------------+---------------------+---------------------+-------------------------------+", - "| val | interval_sum_ts | interval_sum_ts2 | interval_sum_ts3 | interval_sum_ts4 |", - "+-----+-------------------------+---------------------+---------------------+-------------------------------+", - "| 3 | 2023-11-06T13:53:42.001 | 2024-04-02T03:23:40 | 2023-11-02T02:33:41 | 2023-11-03T02:33:40.003300294 |", - "| 2 | 2023-07-12T16:20:22.002 | 2024-02-09T09:20:20 | 2023-07-09T08:47:00 | 2023-12-19T08:47:00.001500005 |", - "| 1 | 2023-03-17T18:47:02.003 | 2023-06-15T15:17:00 | 2023-03-15T15:00:20 | 2023-06-29T15:00:20.002265537 |", - "+-----+-------------------------+---------------------+---------------------+-------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} diff --git a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt index 
25af7b328adf..3abbbaeb72b9 100644 --- a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt +++ b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt @@ -239,26 +239,108 @@ SELECT INTERVAL '8' MONTH + '2000-01-01T00:00:00'::timestamp; ---- 2000-09-01T00:00:00 +# Interval columns are created with timestamp subtraction in subquery since they are not supported yet statement ok -create table foo (val int, ts1 timestamp, ts2 timestamp) as values (1, '2023-03-15T15:00:20'::timestamp, '2023-03-15T15:00:00'::timestamp), (2, '2023-03-15T15:00:10'::timestamp, '2023-03-15T15:00:00'::timestamp); +create table foo (val int, ts1 timestamp, ts2 timestamp) as values +(1, '2023-03-15T15:00:20.000000123'::timestamp, '2023-01-20T23:00:00.000000099'::timestamp), +(2, '2023-02-28T12:01:55.000123456'::timestamp, '2000-02-23T11:00:00.123000001'::timestamp), +(3, '2033-11-02T23:22:13.000123456'::timestamp, '1990-03-01T00:00:00.333000001'::timestamp), +(4, '2003-07-11T01:31:15.000123456'::timestamp, '2045-04-11T15:00:00.000000001'::timestamp); +# Timestamp - Timestamp query I? SELECT val, ts1 - ts2 FROM foo ORDER BY ts2 - ts1; ---- -1 0 years 0 mons 0 days 0 hours 0 mins 20.000000000 secs -2 0 years 0 mons 0 days 0 hours 0 mins 10.000000000 secs +4 0 years 0 mons -15250 days -13 hours -28 mins -44.999876545 secs +3 0 years 0 mons 15952 days 23 hours 22 mins 12.667123455 secs +2 0 years 0 mons 8406 days 1 hours 1 mins 54.877123455 secs +1 0 years 0 mons 53 days 16 hours 0 mins 20.000000024 secs -statement ok -drop table foo; +# Interval - Interval +query ? +SELECT subq.interval1 - subq.interval2 +FROM ( + SELECT ts1 - ts2 AS interval1, + ts2 - ts1 AS interval2 + FROM foo +) AS subq; +---- +0 years 0 mons 106 days 32 hours 0 mins 40.000000048 secs +0 years 0 mons 16812 days 2 hours 3 mins 49.754246910 secs +0 years 0 mons 31904 days 46 hours 44 mins 25.334246910 secs +0 years 0 mons -30500 days -26 hours -57 mins -29.999753090 secs + +# Interval + Interval +query ? 
+SELECT subq.interval1 + subq.interval2 +FROM ( + SELECT ts1 - ts2 AS interval1, + ts2 - ts1 AS interval2 + FROM foo +) AS subq; +---- +0 years 0 mons 0 days 0 hours 0 mins 0.000000000 secs +0 years 0 mons 0 days 0 hours 0 mins 0.000000000 secs +0 years 0 mons 0 days 0 hours 0 mins 0.000000000 secs +0 years 0 mons 0 days 0 hours 0 mins 0.000000000 secs -statement ok -create table foo (val int, ts1 timestamp, ts2 timestamp) as values (1, '2023-03-15T15:00:20.000000123'::timestamp, '2023-03-15T15:00:00.000000099'::timestamp), (2, '2023-03-15T15:00:10.000123456'::timestamp, '2023-03-15T15:00:00.000000001'::timestamp); +# Timestamp - Interval +query P +SELECT subq.ts1 - subq.interval1 +FROM ( + SELECT ts1, + ts1 - ts2 AS interval1 + FROM foo +) AS subq; +---- +2023-01-20T23:00:00.000000099 +2000-02-23T11:00:00.123000001 +1990-03-01T00:00:00.333000001 +2045-04-11T15:00:00.000000001 -query I? -SELECT val, ts1 - ts2 FROM foo ORDER BY ts1 - ts2; +# Interval + Timestamp +query P +SELECT subq.interval1 + subq.ts1 +FROM ( + SELECT ts1, + ts1 - ts2 AS interval1 + FROM foo +) AS subq; ---- -2 0 years 0 mons 0 days 0 hours 0 mins 10.000123455 secs -1 0 years 0 mons 0 days 0 hours 0 mins 20.000000024 secs +2023-05-08T07:00:40.000000147 +2046-03-05T13:03:49.877246911 +2077-07-07T22:44:25.667246911 +1961-10-08T12:02:30.000246911 + +# Timestamp + Interval +query P +SELECT subq.ts1 + subq.interval1 +FROM ( + SELECT ts1, + ts1 - ts2 AS interval1 + FROM foo +) AS subq; +---- +2023-05-08T07:00:40.000000147 +2046-03-05T13:03:49.877246911 +2077-07-07T22:44:25.667246911 +1961-10-08T12:02:30.000246911 + +# Timestamp + Timestamp => error +statement error DataFusion error: Error during planning: Timestamp\(Nanosecond, None\) Timestamp\(Nanosecond, None\) is an unsupported operation. 
addition/subtraction on dates/timestamps only supported with interval types +SELECT ts1 + ts2 +FROM foo; + +# Interval - Timestamp => error +statement error DataFusion error: Error during planning: Interval\(MonthDayNano\) - Timestamp\(Nanosecond, None\) can't be evaluated because there isn't a common type to coerce the types to +SELECT subq.interval1 - subq.ts1 +FROM ( + SELECT ts1, + ts1 - ts2 AS interval1 + FROM foo +) AS subq; statement ok -drop table foo; \ No newline at end of file +drop table foo; + + diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index e85bc25f6989..9c6eefe9e7c9 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -149,7 +149,7 @@ pub fn coerce_types( match result { None => Err(DataFusionError::Plan( format!( - "'{lhs_type:?} {op} {rhs_type:?}' can't be evaluated because there isn't a common type to coerce the types to" + "{lhs_type:?} {op} {rhs_type:?} can't be evaluated because there isn't a common type to coerce the types to" ), )), Some(t) => Ok(t) @@ -235,10 +235,8 @@ pub fn temporal_add_sub_coercion( // if two date/timestamp are being added/subtracted, return an error indicating that the operation is not supported (lhs, rhs, _) if (is_date(lhs) || is_timestamp(lhs)) && (is_date(rhs) || is_timestamp(rhs)) => { Err(DataFusionError::Plan(format!( - "'{:?} {} {:?}' is an unsupported operation. \ - addition/subtraction on dates/timestamps only supported with interval types - ", - lhs_type, op, rhs_type + "{:?} {:?} is an unsupported operation. 
addition/subtraction on dates/timestamps only supported with interval types", + lhs_type, rhs_type ))) } // return None if no coercion is possible @@ -830,7 +828,7 @@ mod tests { coerce_types(&DataType::Float32, &Operator::Plus, &DataType::Utf8); if let Err(DataFusionError::Plan(e)) = result_type { - assert_eq!(e, "'Float32 + Utf8' can't be evaluated because there isn't a common type to coerce the types to"); + assert_eq!(e, "Float32 + Utf8 can't be evaluated because there isn't a common type to coerce the types to"); Ok(()) } else { Err(DataFusionError::Internal( @@ -1011,7 +1009,7 @@ mod tests { let err = coerce_types(&DataType::Date32, &Operator::Plus, &DataType::Date64) .unwrap_err() .to_string(); - assert_contains!(&err, "'Date32 + Date64' is an unsupported operation. addition/subtraction on dates/timestamps only supported with interval types"); + assert_contains!(&err, "Date32 Date64 is an unsupported operation. addition/subtraction on dates/timestamps only supported with interval types"); Ok(()) } diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 680b3718e3af..76d5313ccd79 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -299,13 +299,16 @@ macro_rules! interval_cross_op { }}; } macro_rules! 
ts_interval_op { - ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty, $mode:expr) => {{ + ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ let prim_array_lhs = $caster1(&$lhs)?; let prim_array_rhs = $caster2(&$rhs)?; let ret = Arc::new(binary::<$type_in1, $type_in2, _, $type_in1>( prim_array_lhs, prim_array_rhs, - |ts, interval| $op(ts, interval as i128, $sign, $mode).unwrap(), + |ts, interval| { + $op(ts, interval as i128, $sign) + .expect("error in {$sign} operation of interval with timestamp") + }, )?) as ArrayRef; ret }}; @@ -628,11 +631,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_second_array, as_interval_ym_array, - seconds_add_array, + seconds_add_array::, sign, TimestampSecondType, - IntervalYearMonthType, - IntervalMode::YM + IntervalYearMonthType )), ( DataType::Timestamp(TimeUnit::Second, _), @@ -642,11 +644,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_second_array, as_interval_dt_array, - seconds_add_array, + seconds_add_array::, sign, TimestampSecondType, - IntervalDayTimeType, - IntervalMode::DT + IntervalDayTimeType )), ( DataType::Timestamp(TimeUnit::Second, _), @@ -656,11 +657,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_second_array, as_interval_mdn_array, - seconds_add_array, + seconds_add_array::, sign, TimestampSecondType, - IntervalMonthDayNanoType, - IntervalMode::MDN + IntervalMonthDayNanoType )), ( DataType::Timestamp(TimeUnit::Millisecond, _), @@ -670,11 +670,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_millisecond_array, as_interval_ym_array, - milliseconds_add_array, + milliseconds_add_array::, sign, TimestampMillisecondType, - IntervalYearMonthType, - IntervalMode::YM + IntervalYearMonthType )), ( DataType::Timestamp(TimeUnit::Millisecond, _), @@ -684,11 +683,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_millisecond_array, as_interval_dt_array, - milliseconds_add_array, + 
milliseconds_add_array::, sign, TimestampMillisecondType, - IntervalDayTimeType, - IntervalMode::DT + IntervalDayTimeType )), ( DataType::Timestamp(TimeUnit::Millisecond, _), @@ -698,11 +696,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_millisecond_array, as_interval_mdn_array, - milliseconds_add_array, + milliseconds_add_array::, sign, TimestampMillisecondType, - IntervalMonthDayNanoType, - IntervalMode::MDN + IntervalMonthDayNanoType )), ( DataType::Timestamp(TimeUnit::Microsecond, _), @@ -712,11 +709,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_microsecond_array, as_interval_ym_array, - microseconds_add_array, + microseconds_add_array::, sign, TimestampMicrosecondType, - IntervalYearMonthType, - IntervalMode::YM + IntervalYearMonthType )), ( DataType::Timestamp(TimeUnit::Microsecond, _), @@ -726,11 +722,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_microsecond_array, as_interval_dt_array, - microseconds_add_array, + microseconds_add_array::, sign, TimestampMicrosecondType, - IntervalDayTimeType, - IntervalMode::DT + IntervalDayTimeType )), ( DataType::Timestamp(TimeUnit::Microsecond, _), @@ -740,11 +735,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_microsecond_array, as_interval_mdn_array, - microseconds_add_array, + microseconds_add_array::, sign, TimestampMicrosecondType, - IntervalMonthDayNanoType, - IntervalMode::MDN + IntervalMonthDayNanoType )), ( DataType::Timestamp(TimeUnit::Nanosecond, _), @@ -754,11 +748,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_nanosecond_array, as_interval_ym_array, - nanoseconds_add_array, + nanoseconds_add_array::, sign, TimestampNanosecondType, - IntervalYearMonthType, - IntervalMode::YM + IntervalYearMonthType )), ( DataType::Timestamp(TimeUnit::Nanosecond, _), @@ -768,11 +761,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_nanosecond_array, as_interval_dt_array, - nanoseconds_add_array, + nanoseconds_add_array::, sign, TimestampNanosecondType, - IntervalDayTimeType, - 
IntervalMode::DT + IntervalDayTimeType )), ( DataType::Timestamp(TimeUnit::Nanosecond, _), @@ -782,11 +774,10 @@ fn ts_interval_array_op( array_rhs, as_timestamp_nanosecond_array, as_interval_mdn_array, - nanoseconds_add_array, + nanoseconds_add_array::, sign, TimestampNanosecondType, - IntervalMonthDayNanoType, - IntervalMode::MDN + IntervalMonthDayNanoType )), (_, _) => Err(DataFusionError::Execution(format!( "Invalid array types for Timestamp Interval operation: {:?} {} {:?}", From 20b276a527754d68b2cbfa595fefd29672e8347e Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 28 Mar 2023 15:41:44 +0300 Subject: [PATCH 31/55] tz parsing excluded --- datafusion/common/src/scalar.rs | 29 ++++++++++++++----- datafusion/optimizer/src/type_coercion.rs | 4 +-- .../physical-expr/src/expressions/datetime.rs | 13 ++++----- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 76cbb982bec8..9a453c666952 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -981,8 +981,11 @@ fn ts_sub_to_interval( lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, lhs_tz)?; - let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, rhs_tz)?; + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones(lhs_tz), parse_timezones(rhs_tz)); + + let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, &parsed_lhs_tz)?; + let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, &parsed_rhs_tz)?; let delta_secs = lhs_dt.signed_duration_since(rhs_dt); match TIME_MOD { @@ -1008,12 +1011,27 @@ fn ts_sub_to_interval( } } +// This function parses the timezone from string to Tz. +// If it cannot parse or timezone field is [`None`], it returns [`None`]. 
+pub fn parse_timezones(tz: &Option) -> Option { + if let Some(tz) = tz { + let parsed_tz: Option = FromStr::from_str(tz) + .map_err(|_| { + DataFusionError::Execution("cannot parse given timezone".to_string()) + }) + .ok(); + parsed_tz + } else { + None + } +} + /// This function creates the [`NaiveDateTime`] object corresponding to the /// given timestamp using the units (tick size) implied by argument `mode`. #[inline] pub fn with_timezone_to_naive_datetime( ts: i64, - tz: &Option, + tz: &Option, ) -> Result { let datetime = if TIME_MODE == MILLISECOND_MODE { ticks_to_naive_datetime::<1_000_000>(ts) @@ -1021,10 +1039,7 @@ pub fn with_timezone_to_naive_datetime( ticks_to_naive_datetime::<1>(ts) }?; - if let Some(tz) = tz { - let parsed_tz: Tz = FromStr::from_str(tz).map_err(|_| { - DataFusionError::Execution("cannot parse given timezone".to_string()) - })?; + if let Some(parsed_tz) = tz { let offset = parsed_tz .offset_from_local_datetime(&datetime) .single() diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 437d9cd47d0a..67b81fa79f7e 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -943,7 +943,7 @@ mod test { let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); let err = assert_optimized_plan_eq(&plan, ""); assert!(err.is_err()); - assert!(err.unwrap_err().to_string().contains("'Int64 IS DISTINCT FROM Boolean' can't be evaluated because there isn't a common type to coerce the types to")); + assert!(err.unwrap_err().to_string().contains("Int64 IS DISTINCT FROM Boolean can't be evaluated because there isn't a common type to coerce the types to")); // is not true let expr = col("a").is_not_true(); @@ -1045,7 +1045,7 @@ mod test { let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); let err = assert_optimized_plan_eq(&plan, expected); assert!(err.is_err()); - 
assert!(err.unwrap_err().to_string().contains("'Utf8 IS NOT DISTINCT FROM Boolean' can't be evaluated because there isn't a common type to coerce the types to")); + assert!(err.unwrap_err().to_string().contains("Utf8 IS NOT DISTINCT FROM Boolean can't be evaluated because there isn't a common type to coerce the types to")); // is not unknown let expr = col("a").is_not_unknown(); diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 76d5313ccd79..c9af95cb8e26 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -259,14 +259,16 @@ macro_rules! ts_sub_op { prim_array_lhs, prim_array_rhs, |ts1, ts2| { + let (lhs_tz, rhs_tz) = + (parse_timezones($lhs_tz), parse_timezones($rhs_tz)); Ok($op( $ts_unit(&with_timezone_to_naive_datetime::<$mode>( ts1.mul_wrapping($coef), - &$lhs_tz, + &lhs_tz, )?), $ts_unit(&with_timezone_to_naive_datetime::<$mode>( ts2.mul_wrapping($coef), - &$rhs_tz, + &rhs_tz, )?), )) }, @@ -302,13 +304,10 @@ macro_rules! ts_interval_op { ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ let prim_array_lhs = $caster1(&$lhs)?; let prim_array_rhs = $caster2(&$rhs)?; - let ret = Arc::new(binary::<$type_in1, $type_in2, _, $type_in1>( + let ret = Arc::new(try_binary_op::<$type_in1, $type_in2, _, $type_in1>( prim_array_lhs, prim_array_rhs, - |ts, interval| { - $op(ts, interval as i128, $sign) - .expect("error in {$sign} operation of interval with timestamp") - }, + |ts, interval| Ok($op(ts, interval as i128, $sign)?), )?) 
as ArrayRef; ret }}; From ef1c194512b9ea5401090672938ab576257775f0 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 30 Mar 2023 13:53:54 +0300 Subject: [PATCH 32/55] replace try_binary and as_datetime, and keep timezone for ts+interval op --- datafusion/common/src/scalar.rs | 128 ++++++++++++---- datafusion/core/tests/sql/set_variable.rs | 21 ++- .../sqllogictests/test_files/arrow_typeof.slt | 2 +- datafusion/physical-expr/Cargo.toml | 1 + .../physical-expr/src/expressions/datetime.rs | 145 ++++++------------ 5 files changed, 157 insertions(+), 140 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 9a453c666952..8f034e716f62 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -975,20 +975,20 @@ pub const NANOSECOND_MODE: bool = true; /// interval will have the type [`IntervalDayTimeType`]. /// - When subtracting timestamps at microseconds/nanoseconds precision, the /// output interval will have the type [`IntervalMonthDayNanoType`]. -fn ts_sub_to_interval( +fn ts_sub_to_interval( lhs_ts: i64, rhs_ts: i64, lhs_tz: &Option, rhs_tz: &Option, ) -> Result { - let (parsed_lhs_tz, parsed_rhs_tz) = - (parse_timezones(lhs_tz), parse_timezones(rhs_tz)); + let parsed_lhs_tz = parse_timezones(lhs_tz)?; + let parsed_rhs_tz = parse_timezones(rhs_tz)?; - let lhs_dt = with_timezone_to_naive_datetime::(lhs_ts, &parsed_lhs_tz)?; - let rhs_dt = with_timezone_to_naive_datetime::(rhs_ts, &parsed_rhs_tz)?; - let delta_secs = lhs_dt.signed_duration_since(rhs_dt); + let (naive_lhs, naive_rhs) = + calculate_naives::(lhs_ts, parsed_lhs_tz, rhs_ts, parsed_rhs_tz)?; + let delta_secs = naive_lhs.signed_duration_since(naive_rhs); - match TIME_MOD { + match TIME_MODE { MILLISECOND_MODE => { let as_millisecs = delta_secs.num_milliseconds(); Ok(ScalarValue::new_interval_dt( @@ -1011,18 +1011,82 @@ fn ts_sub_to_interval( } } -// This function parses the timezone from string to Tz. 
-// If it cannot parse or timezone field is [`None`], it returns [`None`]. -pub fn parse_timezones(tz: &Option) -> Option { +/// This function parses the timezone from string to Tz. +/// If it cannot parse or timezone field is [`None`], it returns [`None`]. +pub fn parse_timezones(tz: &Option) -> Result> { if let Some(tz) = tz { - let parsed_tz: Option = FromStr::from_str(tz) - .map_err(|_| { - DataFusionError::Execution("cannot parse given timezone".to_string()) - }) - .ok(); - parsed_tz + let parsed_tz: Tz = FromStr::from_str(tz).map_err(|_| { + DataFusionError::Execution("cannot parse given timezone".to_string()) + })?; + Ok(Some(parsed_tz)) } else { - None + Ok(None) + } +} + +/// This function takes two timestamps with an optional timezone, +/// and returns the duration between them. If one of the timestamps +/// has a [`None`] timezone, the other one is also treated as having [`None`]. +pub fn calculate_naives( + lhs_ts: i64, + parsed_lhs_tz: Option, + rhs_ts: i64, + parsed_rhs_tz: Option, +) -> Result<(NaiveDateTime, NaiveDateTime)> { + let err = || { + DataFusionError::Execution(String::from( + "error while converting Int64 to DateTime in timestamp subtraction", + )) + }; + match (parsed_lhs_tz, parsed_rhs_tz, TIME_MODE) { + (Some(lhs_tz), Some(rhs_tz), MILLISECOND_MODE) => { + let lhs = arrow_array::temporal_conversions::as_datetime_with_timezone::< + arrow_array::types::TimestampMillisecondType, + >(lhs_ts, rhs_tz) + .ok_or_else(err)? + .naive_local(); + let rhs = arrow_array::temporal_conversions::as_datetime_with_timezone::< + arrow_array::types::TimestampMillisecondType, + >(rhs_ts, lhs_tz) + .ok_or_else(err)? + .naive_local(); + Ok((lhs, rhs)) + } + (Some(lhs_tz), Some(rhs_tz), NANOSECOND_MODE) => { + let lhs = arrow_array::temporal_conversions::as_datetime_with_timezone::< + arrow_array::types::TimestampNanosecondType, + >(lhs_ts, rhs_tz) + .ok_or_else(err)? 
+ .naive_local(); + let rhs = arrow_array::temporal_conversions::as_datetime_with_timezone::< + arrow_array::types::TimestampNanosecondType, + >(rhs_ts, lhs_tz) + .ok_or_else(err)? + .naive_local(); + Ok((lhs, rhs)) + } + (_, _, MILLISECOND_MODE) => { + let lhs = arrow_array::temporal_conversions::as_datetime::< + arrow_array::types::TimestampMillisecondType, + >(lhs_ts) + .ok_or_else(err)?; + let rhs = arrow_array::temporal_conversions::as_datetime::< + arrow_array::types::TimestampMillisecondType, + >(rhs_ts) + .ok_or_else(err)?; + Ok((lhs, rhs)) + } + (_, _, NANOSECOND_MODE) => { + let lhs = arrow_array::temporal_conversions::as_datetime::< + arrow_array::types::TimestampNanosecondType, + >(lhs_ts) + .ok_or_else(err)?; + let rhs = arrow_array::temporal_conversions::as_datetime::< + arrow_array::types::TimestampNanosecondType, + >(rhs_ts) + .ok_or_else(err)?; + Ok((lhs, rhs)) + } } } @@ -1112,9 +1176,13 @@ pub fn milliseconds_add_array( interval: i128, sign: i32, ) -> Result { - let secs = ts_ms / 1000; - let nsecs = ((ts_ms % 1000) * 1_000_000) as u32; - do_date_time_math_array::(secs, nsecs, interval, sign) + let mut secs = ts_ms / 1000; + let mut nsecs = ((ts_ms % 1000) * 1_000_000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + do_date_time_math_array::(secs, nsecs as u32, interval, sign) .map(|dt| dt.timestamp_millis()) } @@ -1131,9 +1199,13 @@ pub fn microseconds_add_array( interval: i128, sign: i32, ) -> Result { - let secs = ts_us / 1_000_000; - let nsecs = ((ts_us % 1_000_000) * 1000) as u32; - do_date_time_math_array::(secs, nsecs, interval, sign) + let mut secs = ts_us / 1_000_000; + let mut nsecs = ((ts_us % 1_000_000) * 1000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + do_date_time_math_array::(secs, nsecs as u32, interval, sign) .map(|dt| dt.timestamp_nanos() / 1000) } @@ -1150,9 +1222,13 @@ pub fn nanoseconds_add_array( interval: i128, sign: i32, ) -> Result { - let secs = ts_ns / 1_000_000_000; - 
let nsecs = (ts_ns % 1_000_000_000) as u32; - do_date_time_math_array::(secs, nsecs, interval, sign) + let mut secs = ts_ns / 1_000_000_000; + let mut nsecs = (ts_ns % 1_000_000_000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + do_date_time_math_array::(secs, nsecs as u32, interval, sign) .map(|dt| dt.timestamp_nanos()) } diff --git a/datafusion/core/tests/sql/set_variable.rs b/datafusion/core/tests/sql/set_variable.rs index b7161eb2b162..1ffe43aaf66e 100644 --- a/datafusion/core/tests/sql/set_variable.rs +++ b/datafusion/core/tests/sql/set_variable.rs @@ -427,7 +427,10 @@ async fn set_time_zone_bad_time_zone_format() { .unwrap(); let err = pretty_format_batches(&result).err().unwrap().to_string(); - assert_eq!(err, "Parser error: Invalid timezone \"08:00\": only offset based timezones supported without chrono-tz feature"); + assert_eq!( + err, + "Parser error: Invalid timezone \"08:00\": '08:00' is not a valid timezone" + ); plan_and_collect(&ctx, "SET TIME ZONE = '08'") .await @@ -440,22 +443,16 @@ async fn set_time_zone_bad_time_zone_format() { .unwrap(); let err = pretty_format_batches(&result).err().unwrap().to_string(); - assert_eq!(err, "Parser error: Invalid timezone \"08\": only offset based timezones supported without chrono-tz feature"); + assert_eq!( + err, + "Parser error: Invalid timezone \"08\": '08' is not a valid timezone" + ); // we dont support named time zone yet plan_and_collect(&ctx, "SET TIME ZONE = 'Asia/Taipei'") .await .unwrap(); - // casting UTF-8 to TimestampTZ isn't supported yet, add Timestamp as the middle layer for now - let result = - plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") - .await - .unwrap(); - - let err = pretty_format_batches(&result).err().unwrap().to_string(); - assert_eq!(err, "Parser error: Invalid timezone \"Asia/Taipei\": only offset based timezones supported without chrono-tz feature"); - // this is invalid even after we support named time zone 
plan_and_collect(&ctx, "SET TIME ZONE = 'Asia/Taipei2'") .await @@ -467,5 +464,5 @@ async fn set_time_zone_bad_time_zone_format() { .await .unwrap(); let err = pretty_format_batches(&result).err().unwrap().to_string(); - assert_eq!(err, "Parser error: Invalid timezone \"Asia/Taipei2\": only offset based timezones supported without chrono-tz feature"); + assert_eq!(err, "Parser error: Invalid timezone \"Asia/Taipei2\": 'Asia/Taipei2' is not a valid timezone"); } diff --git a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt index fee24740a6f1..5e729ddb9556 100644 --- a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt +++ b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt @@ -279,7 +279,7 @@ query error Cannot automatically convert Interval\(DayTime\) to Interval\(MonthD --- select arrow_cast(interval '30 minutes', 'Interval(MonthDayNano)'); -query error DataFusion error: Error during planning: Cannot automatically convert Utf8 to Interval\(MonthDayNano\) +query error DataFusion error: Arrow error: Cast error: Casting from Utf8 to Interval\(MonthDayNano\) not supported select arrow_cast('30 minutes', 'Interval(MonthDayNano)'); diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index e21569ce83f1..bbb3067287a9 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -44,6 +44,7 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } arrow = { workspace = true } +arrow-array = { version = "34.0.0", default-features = false, features = ["chrono-tz"] } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } blake2 = { version = "^0.10.2", optional = true } diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 
c9af95cb8e26..518a28268765 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -17,7 +17,7 @@ use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; -use arrow::array::{Array, ArrayData, ArrayRef, ArrowPrimitiveType, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, PrimitiveArray}; use arrow::compute::{binary, unary}; use arrow::datatypes::{ ArrowNativeTypeOp, DataType, Date32Type, Date64Type, IntervalDayTimeType, @@ -26,9 +26,7 @@ use arrow::datatypes::{ TimestampSecondType, }; use arrow::record_batch::RecordBatch; -use arrow::util::bit_mask::combine_option_bitmap; -use arrow_buffer::Buffer; -use arrow_schema::{ArrowError, IntervalUnit}; +use arrow_schema::IntervalUnit; use chrono::NaiveDateTime; use datafusion_common::cast::*; use datafusion_common::scalar::*; @@ -252,28 +250,22 @@ pub fn evaluate_array( } macro_rules! ts_sub_op { - ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $ts_unit:expr, $mode:expr, $type_in:ty, $type_out:ty) => {{ + ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $ts_unit:expr, $mode:expr, $type_out:ty) => {{ let prim_array_lhs = $caster(&$lhs)?; let prim_array_rhs = $caster(&$rhs)?; - let ret = Arc::new(try_binary_op::<$type_in, $type_in, _, $type_out>( - prim_array_lhs, - prim_array_rhs, - |ts1, ts2| { - let (lhs_tz, rhs_tz) = - (parse_timezones($lhs_tz), parse_timezones($rhs_tz)); - Ok($op( - $ts_unit(&with_timezone_to_naive_datetime::<$mode>( - ts1.mul_wrapping($coef), - &lhs_tz, - )?), - $ts_unit(&with_timezone_to_naive_datetime::<$mode>( - ts2.mul_wrapping($coef), - &rhs_tz, - )?), - )) - }, - )?) 
as ArrayRef; - ret + let ret: PrimitiveArray<$type_out> = + arrow::compute::try_binary(prim_array_lhs, prim_array_rhs, |ts1, ts2| { + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones($lhs_tz)?, parse_timezones($rhs_tz)?); + let (naive_lhs, naive_rhs) = calculate_naives::<$mode>( + ts1.mul_wrapping($coef), + parsed_lhs_tz, + ts2.mul_wrapping($coef), + parsed_rhs_tz, + )?; + Ok($op($ts_unit(&naive_lhs), $ts_unit(&naive_rhs))) + })?; + Arc::new(ret) as ArrayRef }}; } macro_rules! interval_op { @@ -301,15 +293,15 @@ macro_rules! interval_cross_op { }}; } macro_rules! ts_interval_op { - ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ + ($lhs:ident, $rhs:ident, $tz:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ let prim_array_lhs = $caster1(&$lhs)?; let prim_array_rhs = $caster2(&$rhs)?; - let ret = Arc::new(try_binary_op::<$type_in1, $type_in2, _, $type_in1>( + let ret: PrimitiveArray<$type_in1> = arrow::compute::try_binary( prim_array_lhs, prim_array_rhs, |ts, interval| Ok($op(ts, interval as i128, $sign)?), - )?) 
as ArrayRef; - ret + )?; + Arc::new(ret.with_timezone_opt($tz.clone())) as ArrayRef }}; } // This function evaluates temporal array operations, such as timestamp - timestamp, interval + interval, @@ -348,63 +340,6 @@ pub fn evaluate_temporal_arrays( Ok(ColumnarValue::Array(ret)) } -#[inline] -unsafe fn build_primitive_array( - len: usize, - buffer: Buffer, - null_count: usize, - null_buffer: Option, -) -> PrimitiveArray { - PrimitiveArray::from(ArrayData::new_unchecked( - O::DATA_TYPE, - len, - Some(null_count), - null_buffer, - 0, - vec![buffer], - vec![], - )) -} - -pub fn try_binary_op( - a: &PrimitiveArray, - b: &PrimitiveArray, - op: F, -) -> Result, ArrowError> -where - A: ArrowPrimitiveType, - B: ArrowPrimitiveType, - O: ArrowPrimitiveType, - F: Fn(A::Native, B::Native) -> Result, -{ - if a.len() != b.len() { - return Err(ArrowError::ComputeError( - "Cannot perform binary operation on arrays of different length".to_string(), - )); - } - let len = a.len(); - - if a.is_empty() { - return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); - } - - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); - let null_count = null_buffer - .as_ref() - .map(|x| len - x.count_set_bits_offset(0, len)) - .unwrap_or_default(); - - let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size from a PrimitiveArray - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values) }?; - - Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }) -} - /// Performs a timestamp subtraction operation on two arrays and returns the resulting array. 
fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { match (array_lhs.data_type(), array_rhs.data_type()) { @@ -421,7 +356,6 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { seconds_sub, NaiveDateTime::timestamp, MILLISECOND_MODE, - TimestampSecondType, IntervalDayTimeType )), ( @@ -437,7 +371,6 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { milliseconds_sub, NaiveDateTime::timestamp_millis, MILLISECOND_MODE, - TimestampMillisecondType, IntervalDayTimeType )), ( @@ -453,7 +386,6 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { microseconds_sub, NaiveDateTime::timestamp_micros, NANOSECOND_MODE, - TimestampMicrosecondType, IntervalMonthDayNanoType )), ( @@ -469,7 +401,6 @@ fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { nanoseconds_sub, NaiveDateTime::timestamp_nanos, NANOSECOND_MODE, - TimestampNanosecondType, IntervalMonthDayNanoType )), (_, _) => Err(DataFusionError::Execution(format!( @@ -623,11 +554,12 @@ fn ts_interval_array_op( ) -> Result { match (array_lhs.data_type(), array_rhs.data_type()) { ( - DataType::Timestamp(TimeUnit::Second, _), + DataType::Timestamp(TimeUnit::Second, tz), DataType::Interval(IntervalUnit::YearMonth), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_second_array, as_interval_ym_array, seconds_add_array::, @@ -636,11 +568,12 @@ fn ts_interval_array_op( IntervalYearMonthType )), ( - DataType::Timestamp(TimeUnit::Second, _), + DataType::Timestamp(TimeUnit::Second, tz), DataType::Interval(IntervalUnit::DayTime), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_second_array, as_interval_dt_array, seconds_add_array::, @@ -649,11 +582,12 @@ fn ts_interval_array_op( IntervalDayTimeType )), ( - DataType::Timestamp(TimeUnit::Second, _), + DataType::Timestamp(TimeUnit::Second, tz), DataType::Interval(IntervalUnit::MonthDayNano), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, 
as_timestamp_second_array, as_interval_mdn_array, seconds_add_array::, @@ -662,11 +596,12 @@ fn ts_interval_array_op( IntervalMonthDayNanoType )), ( - DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Timestamp(TimeUnit::Millisecond, tz), DataType::Interval(IntervalUnit::YearMonth), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_millisecond_array, as_interval_ym_array, milliseconds_add_array::, @@ -675,11 +610,12 @@ fn ts_interval_array_op( IntervalYearMonthType )), ( - DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Timestamp(TimeUnit::Millisecond, tz), DataType::Interval(IntervalUnit::DayTime), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_millisecond_array, as_interval_dt_array, milliseconds_add_array::, @@ -688,11 +624,12 @@ fn ts_interval_array_op( IntervalDayTimeType )), ( - DataType::Timestamp(TimeUnit::Millisecond, _), + DataType::Timestamp(TimeUnit::Millisecond, tz), DataType::Interval(IntervalUnit::MonthDayNano), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_millisecond_array, as_interval_mdn_array, milliseconds_add_array::, @@ -701,11 +638,12 @@ fn ts_interval_array_op( IntervalMonthDayNanoType )), ( - DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Timestamp(TimeUnit::Microsecond, tz), DataType::Interval(IntervalUnit::YearMonth), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_microsecond_array, as_interval_ym_array, microseconds_add_array::, @@ -714,11 +652,12 @@ fn ts_interval_array_op( IntervalYearMonthType )), ( - DataType::Timestamp(TimeUnit::Microsecond, _), + DataType::Timestamp(TimeUnit::Microsecond, tz), DataType::Interval(IntervalUnit::DayTime), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_microsecond_array, as_interval_dt_array, microseconds_add_array::, @@ -727,11 +666,12 @@ fn ts_interval_array_op( IntervalDayTimeType )), ( - DataType::Timestamp(TimeUnit::Microsecond, _), + 
DataType::Timestamp(TimeUnit::Microsecond, tz), DataType::Interval(IntervalUnit::MonthDayNano), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_microsecond_array, as_interval_mdn_array, microseconds_add_array::, @@ -740,11 +680,12 @@ fn ts_interval_array_op( IntervalMonthDayNanoType )), ( - DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Timestamp(TimeUnit::Nanosecond, tz), DataType::Interval(IntervalUnit::YearMonth), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_nanosecond_array, as_interval_ym_array, nanoseconds_add_array::, @@ -753,11 +694,12 @@ fn ts_interval_array_op( IntervalYearMonthType )), ( - DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Timestamp(TimeUnit::Nanosecond, tz), DataType::Interval(IntervalUnit::DayTime), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_nanosecond_array, as_interval_dt_array, nanoseconds_add_array::, @@ -766,11 +708,12 @@ fn ts_interval_array_op( IntervalDayTimeType )), ( - DataType::Timestamp(TimeUnit::Nanosecond, _), + DataType::Timestamp(TimeUnit::Nanosecond, tz), DataType::Interval(IntervalUnit::MonthDayNano), ) => Ok(ts_interval_op!( array_lhs, array_rhs, + tz, as_timestamp_nanosecond_array, as_interval_mdn_array, nanoseconds_add_array::, From 21e1df87090a190514be230c2fddf678fa70cc78 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 30 Mar 2023 14:14:41 +0300 Subject: [PATCH 33/55] fix after merge --- datafusion/core/tests/sql/mod.rs | 14 +++----------- datafusion/core/tests/sql/set_variable.rs | 8 ++++++++ .../sqllogictests/test_files/arrow_typeof.slt | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 9027ebd0d960..00918638e916 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -1339,20 +1339,12 @@ where A: ArrowTimestampType, { let schema = Arc::new(Schema::new(vec![ - Field::new( - "ts1", - 
DataType::Timestamp(A::get_time_unit(), tz1.clone()), - false, - ), - Field::new( - "ts2", - DataType::Timestamp(A::get_time_unit(), tz2.clone()), - false, - ), + Field::new("ts1", DataType::Timestamp(A::UNIT, tz1.clone()), false), + Field::new("ts2", DataType::Timestamp(A::UNIT, tz2.clone()), false), Field::new("val", DataType::Int32, true), ])); - let divisor = match A::get_time_unit() { + let divisor = match A::UNIT { TimeUnit::Nanosecond => 1, TimeUnit::Microsecond => 1000, TimeUnit::Millisecond => 1_000_000, diff --git a/datafusion/core/tests/sql/set_variable.rs b/datafusion/core/tests/sql/set_variable.rs index 39e209209f6a..b89264ebbab9 100644 --- a/datafusion/core/tests/sql/set_variable.rs +++ b/datafusion/core/tests/sql/set_variable.rs @@ -453,6 +453,14 @@ async fn set_time_zone_bad_time_zone_format() { .await .unwrap(); + // casting UTF-8 to TimestampTZ isn't supported yet, add Timestamp as the middle layer for now + let result = + plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") + .await + .unwrap(); + let batch_pretty = pretty_format_batches(&result).unwrap().to_string(); + assert_eq!(batch_pretty, "+-----------------------------+\n| Utf8(\"2000-01-01T00:00:00\") |\n+-----------------------------+\n| 2000-01-01T08:00:00+08:00 |\n+-----------------------------+"); + // this is invalid even after we support named time zone plan_and_collect(&ctx, "SET TIME ZONE = 'Asia/Taipei2'") .await diff --git a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt index 5e729ddb9556..94b954bfde1e 100644 --- a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt +++ b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt @@ -279,7 +279,7 @@ query error Cannot automatically convert Interval\(DayTime\) to Interval\(MonthD --- select arrow_cast(interval '30 minutes', 'Interval(MonthDayNano)'); -query error DataFusion error: Arrow error: Cast error: 
Casting from Utf8 to Interval\(MonthDayNano\) not supported +query error DataFusion error: This feature is not implemented: Can't create a scalar from array of type "Interval\(MonthDayNano\)" select arrow_cast('30 minutes', 'Interval(MonthDayNano)'); From b20eb77db59205eb114a0435ffa381ab5cd0b38b Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 30 Mar 2023 14:36:00 +0300 Subject: [PATCH 34/55] delete unused functions --- datafusion-cli/Cargo.lock | 384 ++++++++++++++++++++------------ datafusion/common/src/scalar.rs | 46 +--- 2 files changed, 241 insertions(+), 189 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 49e2478d842d..e8f97824b6c7 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -74,16 +74,16 @@ checksum = "990dfa1a9328504aa135820da1c95066537b69ad94c04881b785f64328e0fa6b" dependencies = [ "ahash", "arrow-arith", - "arrow-array", - "arrow-buffer", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", "arrow-cast", "arrow-csv", - "arrow-data", + "arrow-data 36.0.0", "arrow-ipc", "arrow-json", "arrow-ord", "arrow-row", - "arrow-schema", + "arrow-schema 36.0.0", "arrow-select", "arrow-string", ] @@ -94,15 +94,32 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b2e52de0ab54173f9b08232b7184c26af82ee7ab4ac77c83396633c90199fa" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "chrono", "half", "num", ] +[[package]] +name = "arrow-array" +version = "34.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d35d5475e65c57cffba06d0022e3006b677515f99b54af33a7cd54f6cdd4a5b5" +dependencies = [ + "ahash", + "arrow-buffer 34.0.0", + "arrow-data 34.0.0", + "arrow-schema 34.0.0", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.13.2", + "num", +] + [[package]] name = "arrow-array" version = "36.0.0" 
@@ -110,9 +127,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e10849b60c17dbabb334be1f4ef7550701aa58082b71335ce1ed586601b2f423" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "chrono", "chrono-tz", "half", @@ -120,6 +137,16 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "34.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b4ec72eda7c0207727df96cf200f539749d736b21f3e782ece113e18c1a0a7" +dependencies = [ + "half", + "num", +] + [[package]] name = "arrow-buffer" version = "36.0.0" @@ -136,10 +163,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b88897802515d7b193e38b27ddd9d9e43923d410a9e46307582d756959ee9595" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "arrow-select", "chrono", "comfy-table", @@ -153,11 +180,11 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c8220d9741fc37961262710ceebd8451a5b393de57c464f0267ffdda1775c0a" dependencies = [ - "arrow-array", - "arrow-buffer", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", "arrow-cast", - "arrow-data", - "arrow-schema", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "chrono", "csv", "csv-core", @@ -166,14 +193,26 @@ dependencies = [ "regex", ] +[[package]] +name = "arrow-data" +version = "34.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27cc673ee6989ea6e4b4e8c7d461f7e06026a096c8f0b1a7288885ff71ae1e56" +dependencies = [ + "arrow-buffer 34.0.0", + "arrow-schema 34.0.0", + "half", + "num", +] + [[package]] name = "arrow-data" version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"533f937efa1aaad9dc86f6a0e382c2fa736a4943e2090c946138079bdf060cef" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 36.0.0", + "arrow-schema 36.0.0", "half", "num", ] @@ -184,11 +223,11 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18b75296ff01833f602552dff26a423fc213db8e5049b540ca4a00b1c957e41c" dependencies = [ - "arrow-array", - "arrow-buffer", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", "arrow-cast", - "arrow-data", - "arrow-schema", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "flatbuffers", ] @@ -198,11 +237,11 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e501d3de4d612c90677594896ca6c0fa075665a7ff980dc4189bb531c17e19f6" dependencies = [ - "arrow-array", - "arrow-buffer", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", "arrow-cast", - "arrow-data", - "arrow-schema", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "chrono", "half", "indexmap", @@ -217,10 +256,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d2671eb3793f9410230ac3efb0e6d36307be8a2dac5fad58ac9abde8e9f01e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "arrow-select", "half", "num", @@ -233,14 +272,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc11fa039338cebbf4e29cf709c8ac1d6a65c7540063d4a25f991ab255ca85c8" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "half", "hashbrown 0.13.2", ] +[[package]] +name = "arrow-schema" +version = "34.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64951898473bfb8e22293e83a44f02874d2257514d49cd95f9aa4afcff183fbc" + [[package]] 
name = "arrow-schema" version = "36.0.0" @@ -253,10 +298,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "163e35de698098ff5f5f672ada9dc1f82533f10407c7a11e2cd09f3bcf31d18a" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "num", ] @@ -266,10 +311,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdfbed1b10209f0dc68e6aa4c43dc76079af65880965c7c3b73f641f23d4aba" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", + "arrow-data 36.0.0", + "arrow-schema 36.0.0", "arrow-select", "regex", "regex-syntax", @@ -295,13 +340,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.67" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ea188f25f0255d8f92797797c97ebf5631fa88178beb1a46fdf5622c9a00e4" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.8", + "syn 2.0.11", ] [[package]] @@ -582,9 +627,9 @@ checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" dependencies = [ "libc", ] @@ -637,9 +682,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c00419335c41018365ddf7e4d5f1c12ee3659ddcf3e01974650ba1de73d038" +checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" 
dependencies = [ "cc", "cxxbridge-flags", @@ -649,9 +694,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb8307ad413a98fff033c8545ecf133e3257747b3bae935e7602aab8aa92d4ca" +checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" dependencies = [ "cc", "codespan-reporting", @@ -659,24 +704,24 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn 2.0.8", + "syn 2.0.11", ] [[package]] name = "cxxbridge-flags" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edc52e2eb08915cb12596d29d55f0b5384f00d697a646dbd269b6ecb0fbd9d31" +checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" [[package]] name = "cxxbridge-macro" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "631569015d0d8d54e6c241733f944042623ab6df7bc3be7466874b05fcdb1c5f" +checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.8", + "syn 2.0.11", ] [[package]] @@ -761,7 +806,7 @@ name = "datafusion-common" version = "21.0.0" dependencies = [ "arrow", - "arrow-array", + "arrow-array 36.0.0", "chrono", "num_cpus", "object_store", @@ -817,8 +862,9 @@ version = "21.0.0" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", + "arrow-array 34.0.0", + "arrow-buffer 36.0.0", + "arrow-schema 36.0.0", "blake2", "blake3", "chrono", @@ -854,7 +900,7 @@ dependencies = [ name = "datafusion-sql" version = "21.0.0" dependencies = [ - "arrow-schema", + "arrow-schema 36.0.0", "datafusion-common", "datafusion-expr", "log", @@ -955,13 +1001,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "50d6a0976c999d473fe89ad888d5a284e55366d9dc9038b1ba2aa15128c4afa0" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys", ] [[package]] @@ -995,13 +1041,13 @@ dependencies = [ [[package]] name = "fd-lock" -version = "3.0.10" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ef1a30ae415c3a691a4f41afddc2dbcd6d70baf338368d85ebc1e8ed92cedb9" +checksum = "9799aefb4a2e4a01cc47610b1dd47c18ab13d991f27bbcaed9296f5a53d5cbad" dependencies = [ "cfg-if", "rustix", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -1136,9 +1182,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -1314,9 +1360,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.54" +version = "0.1.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c17cc76786e99f8d2f055c11159e7f0091c42474dcc3189fbab96072e873e6d" +checksum = "716f12fbcfac6ffab0a5e9ec51d0a0ff70503742bb2dc7b99396394c9dc323f0" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1348,9 +1394,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown 0.12.3", @@ -1379,14 +1425,14 @@ checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" dependencies = [ "hermit-abi 0.3.1", "libc", - "windows-sys 0.45.0", + 
"windows-sys", ] [[package]] name = "ipnet" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" [[package]] name = "itertools" @@ -1524,9 +1570,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "cd550e73688e6d578f0ac2119e32b797a327631a42f9433e59d02e139c8df60d" [[package]] name = "lock_api" @@ -1626,7 +1672,7 @@ dependencies = [ "libc", "log", "wasi", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -1804,9 +1850,9 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -1816,12 +1862,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "321a15f8332645759f29875b07f8233d16ed8ec1b3582223de81625a9f8506b7" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", + "arrow-array 36.0.0", + "arrow-buffer 36.0.0", "arrow-cast", - "arrow-data", + "arrow-data 36.0.0", "arrow-ipc", - "arrow-schema", + "arrow-schema 36.0.0", "arrow-select", "base64", "brotli", @@ -1967,9 +2013,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.53" +version = "1.0.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba466839c78239c09faf015484e5cc04860f88242cff4d03eb038f04b4699b73" +checksum = "e472a104799c74b514a57226160104aa483546de37e839ec50e3c2e41dd87534" dependencies = [ "unicode-ident", ] @@ -2042,6 +2088,15 @@ dependencies = [ "bitflags", ] +[[package]] +name = 
"redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + [[package]] name = "redox_users" version = "0.4.3" @@ -2049,15 +2104,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ "getrandom", - "redox_syscall", + "redox_syscall 0.2.16", "thiserror", ] [[package]] name = "regex" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cce168fea28d3e05f158bda4576cf0c844d5045bc2cc3620fa0292ed5bb5814c" +checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" dependencies = [ "aho-corasick", "memchr", @@ -2072,9 +2127,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "reqwest" -version = "0.11.15" +version = "0.11.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ba30cc2c0cd02af1222ed216ba659cdb2f879dfe3181852fe7c50b1d0005949" +checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" dependencies = [ "base64", "bytes", @@ -2137,16 +2192,16 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.11" +version = "0.37.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e" +checksum = "0e78cc525325c06b4a7ff02db283472f3c042b7ff0c391f96c6d5ac6f4f91b75" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -2250,29 +2305,29 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.158" +version = "1.0.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "771d4d9c4163ee138805e12c710dd365e4f44be8be0503cb1bb9eb989425d9c9" +checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.158" +version = "1.0.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e801c1712f48475582b7696ac71e0ca34ebb30e09338425384269d9717c62cad" +checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585" dependencies = [ "proc-macro2", "quote", - "syn 2.0.8", + "syn 2.0.11", ] [[package]] name = "serde_json" -version = "1.0.94" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" +checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744" dependencies = [ "itoa", "ryu", @@ -2444,9 +2499,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.8" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcc02725fd69ab9f26eab07fad303e2497fad6fb9eba4f96c4d1687bdf704ad9" +checksum = "21e3787bb71465627110e7d87ed4faaa36c1f61042ee67badb9e2ef173accc40" dependencies = [ "proc-macro2", "quote", @@ -2455,15 +2510,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", + "redox_syscall 0.3.5", "rustix", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] @@ -2498,7 +2553,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.8", + "syn 2.0.11", ] [[package]] @@ -2538,32 +2593,31 @@ checksum = 
"1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.26.0" +version = "1.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64" +checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" dependencies = [ "autocfg", "bytes", "libc", - "memchr", "mio", "num_cpus", "parking_lot", "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.11", ] [[package]] @@ -2900,50 +2954,50 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" -version = "0.46.0" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdacb41e6a96a052c6cb63a144f24900236121c6f63f4f8219fef5977ecb0c25" +checksum = "2649ff315bee4c98757f15dac226efe3d81927adbb6e882084bb1ee3e0c330a7" dependencies = [ - "windows-targets", + "windows-targets 0.47.0", ] [[package]] name = "windows-sys" -version = "0.42.0" +version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows-targets 0.42.2", ] [[package]] -name = "windows-sys" -version = "0.45.0" +name = "windows-targets" +version = 
"0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows-targets", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "2f8996d3f43b4b2d44327cd71b7b0efd1284ab60e6e9d0e8b630e18555d87d3e" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.47.0", + "windows_aarch64_msvc 0.47.0", + "windows_i686_gnu 0.47.0", + "windows_i686_msvc 0.47.0", + "windows_x86_64_gnu 0.47.0", + "windows_x86_64_gnullvm 0.47.0", + "windows_x86_64_msvc 0.47.0", ] [[package]] @@ -2952,42 +3006,84 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831d567d53d4f3cb1db332b68e6e2b6260228eb4d99a777d8b2e8ed794027c90" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +[[package]] +name = "windows_aarch64_msvc" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6a42d54a417c60ce4f0e31661eed628f0fa5aca73448c093ec4d45fab4c51cdf" + [[package]] name = "windows_i686_gnu" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +[[package]] +name = "windows_i686_gnu" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1925beafdbb22201a53a483db861a5644123157c1c3cee83323a2ed565d71e3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +[[package]] +name = "windows_i686_msvc" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8ef8f2f1711b223947d9b69b596cf5a4e452c930fb58b6fc3fdae7d0ec6b31" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +[[package]] +name = "windows_x86_64_gnu" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7acaa0c2cf0d2ef99b61c308a0c3dbae430a51b7345dedec470bd8f53f5a3642" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a0628f71be1d11e17ca4a0e9e15b3a5180f6fbf1c2d55e3ba3f850378052c1" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +[[package]] +name = "windows_x86_64_msvc" +version = "0.47.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d6e62c256dc6d40b8c8707df17df8d774e60e39db723675241e7c15e910bce7" + [[package]] name = "winreg" version = "0.10.1" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 460b99e44c58..8b55b0a79bf6 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -44,7 +44,7 @@ use arrow::{ }, }; use arrow_array::timezone::Tz; -use chrono::{DateTime, Datelike, Duration, NaiveDate, NaiveDateTime, TimeZone}; +use chrono::{Datelike, Duration, NaiveDate, NaiveDateTime}; // Constants we use throughout this file: const MILLISECS_IN_ONE_DAY: i64 = 86_400_000; @@ -1090,50 +1090,6 @@ pub fn calculate_naives( } } -/// This function creates the [`NaiveDateTime`] object corresponding to the -/// given timestamp using the units (tick size) implied by argument `mode`. -#[inline] -pub fn with_timezone_to_naive_datetime( - ts: i64, - tz: &Option, -) -> Result { - let datetime = if TIME_MODE == MILLISECOND_MODE { - ticks_to_naive_datetime::<1_000_000>(ts) - } else { - ticks_to_naive_datetime::<1>(ts) - }?; - - if let Some(parsed_tz) = tz { - let offset = parsed_tz - .offset_from_local_datetime(&datetime) - .single() - .ok_or_else(|| { - DataFusionError::Execution( - "error conversion result of timezone offset".to_string(), - ) - })?; - return Ok(DateTime::::from_local(datetime, offset).naive_utc()); - } - Ok(datetime) -} - -/// This function creates the [`NaiveDateTime`] object corresponding to the -/// given timestamp, whose tick size is specified by `UNIT_NANOS`. 
-#[inline] -fn ticks_to_naive_datetime(ticks: i64) -> Result { - let mut secs: i64 = ((ticks as i128 * UNIT_NANOS) / 1_000_000_000) as i64; - let mut nsecs: i32 = ((ticks as i128 * UNIT_NANOS) % 1_000_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } - NaiveDateTime::from_timestamp_opt(secs, nsecs as u32).ok_or_else(|| { - DataFusionError::Execution( - "Can not convert given timestamp to a NaiveDateTime".to_string(), - ) - }) -} - #[inline] pub fn date32_add(days: i32, scalar: &ScalarValue, sign: i32) -> Result { let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); From 653d8f84ba30b3e6fb24de19a74f94a7ed699d46 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 3 Apr 2023 15:22:48 +0300 Subject: [PATCH 35/55] ready to review --- datafusion/common/src/scalar.rs | 72 ++++- .../joins/symmetric_hash_join.rs | 244 +++++++++++++++- .../sqllogictests/test_files/arrow_typeof.slt | 2 +- .../physical-expr/src/aggregate/min_max.rs | 96 +++++++ .../physical-expr/src/expressions/binary.rs | 265 +++++++++++++++++- .../physical-expr/src/expressions/datetime.rs | 121 ++++---- .../physical-expr/src/intervals/cp_solver.rs | 18 +- .../src/intervals/interval_aritmetic.rs | 24 +- .../physical-expr/src/intervals/test_utils.rs | 131 ++++++++- 9 files changed, 898 insertions(+), 75 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 8b55b0a79bf6..dfcb305ef11a 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -1016,7 +1016,7 @@ fn ts_sub_to_interval( pub fn parse_timezones(tz: &Option) -> Result> { if let Some(tz) = tz { let parsed_tz: Tz = FromStr::from_str(tz).map_err(|_| { - DataFusionError::Execution("cannot parse given timezone".to_string()) + DataFusionError::Execution(format!("cannot parse '{tz}' as timezone")) })?; Ok(Some(parsed_tz)) } else { @@ -1121,9 +1121,13 @@ pub fn seconds_add_array( #[inline] pub fn milliseconds_add(ts_ms: i64, scalar: &ScalarValue, sign: 
i32) -> Result { - let secs = ts_ms / 1000; - let nsecs = ((ts_ms % 1000) * 1_000_000) as u32; - do_date_time_math(secs, nsecs, scalar, sign).map(|dt| dt.timestamp_millis()) + let mut secs = ts_ms / 1000; + let mut nsecs = ((ts_ms % 1000) * 1_000_000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + do_date_time_math(secs, nsecs as u32, scalar, sign).map(|dt| dt.timestamp_millis()) } #[inline] @@ -1144,9 +1148,14 @@ pub fn milliseconds_add_array( #[inline] pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result { - let secs = ts_us / 1_000_000; - let nsecs = ((ts_us % 1_000_000) * 1000) as u32; - do_date_time_math(secs, nsecs, scalar, sign).map(|dt| dt.timestamp_nanos() / 1000) + let mut secs = ts_us / 1_000_000; + let mut nsecs = ((ts_us % 1_000_000) * 1000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + do_date_time_math(secs, nsecs as u32, scalar, sign) + .map(|dt| dt.timestamp_nanos() / 1000) } #[inline] @@ -1167,9 +1176,13 @@ pub fn microseconds_add_array( #[inline] pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result { - let secs = ts_ns / 1_000_000_000; - let nsecs = (ts_ns % 1_000_000_000) as u32; - do_date_time_math(secs, nsecs, scalar, sign).map(|dt| dt.timestamp_nanos()) + let mut secs = ts_ns / 1_000_000_000; + let mut nsecs = (ts_ns % 1_000_000_000) as i32; + if nsecs < 0 { + secs -= 1; + nsecs += 1_000_000_000; + } + do_date_time_math(secs, nsecs as u32, scalar, sign).map(|dt| dt.timestamp_nanos()) } #[inline] @@ -1226,7 +1239,7 @@ fn do_date_time_math( ) -> Result { let prior = NaiveDateTime::from_timestamp_opt(secs, nsecs).ok_or_else(|| { DataFusionError::Internal(format!( - "Could not conert to NaiveDateTime: secs {secs} nsecs {nsecs} scalar {scalar:?} sign {sign}" + "Could not convert to NaiveDateTime: secs {secs} nsecs {nsecs} scalar {scalar:?} sign {sign}" )) })?; do_date_math(prior, scalar, sign) @@ -1241,7 +1254,7 @@ fn do_date_time_math_array( ) -> 
Result { let prior = NaiveDateTime::from_timestamp_opt(secs, nsecs).ok_or_else(|| { DataFusionError::Internal(format!( - "Could not conert to NaiveDateTime: secs {secs} nsecs {nsecs}" + "Could not convert to NaiveDateTime: secs {secs} nsecs {nsecs}" )) })?; do_date_math_array::<_, INTERVAL_MODE>(prior, interval, sign) @@ -1697,6 +1710,27 @@ impl ScalarValue { DataType::UInt64 => ScalarValue::UInt64(Some(0)), DataType::Float32 => ScalarValue::Float32(Some(0.0)), DataType::Float64 => ScalarValue::Float64(Some(0.0)), + DataType::Timestamp(TimeUnit::Second, tz) => { + ScalarValue::TimestampSecond(Some(0), tz.clone()) + } + DataType::Timestamp(TimeUnit::Millisecond, tz) => { + ScalarValue::TimestampMillisecond(Some(0), tz.clone()) + } + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + ScalarValue::TimestampMicrosecond(Some(0), tz.clone()) + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + ScalarValue::TimestampNanosecond(Some(0), tz.clone()) + } + DataType::Interval(IntervalUnit::YearMonth) => { + ScalarValue::IntervalYearMonth(Some(0)) + } + DataType::Interval(IntervalUnit::DayTime) => { + ScalarValue::IntervalDayTime(Some(0)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + ScalarValue::IntervalMonthDayNano(Some(0)) + } _ => { return Err(DataFusionError::NotImplemented(format!( "Can't create a zero scalar from data_type \"{datatype:?}\"" @@ -2785,6 +2819,20 @@ impl ScalarValue { tz_opt ) } + DataType::Interval(IntervalUnit::YearMonth) => { + typed_cast!(array, index, IntervalYearMonthArray, IntervalYearMonth) + } + DataType::Interval(IntervalUnit::DayTime) => { + typed_cast!(array, index, IntervalDayTimeArray, IntervalDayTime) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + typed_cast!( + array, + index, + IntervalMonthDayNanoArray, + IntervalMonthDayNano + ) + } DataType::Dictionary(key_type, _) => { let (values_array, values_index) = match key_type.as_ref() { DataType::Int8 => get_dict_value::(array, index), diff --git 
a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index 3af983d8f06a..848aeb98870e 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1423,17 +1423,20 @@ impl SymmetricHashJoinStream { mod tests { use std::fs::File; - use arrow::array::ArrayRef; + use arrow::array::{ArrayRef, IntervalDayTimeArray, TimestampMillisecondArray}; use arrow::array::{Int32Array, TimestampNanosecondArray}; use arrow::compute::SortOptions; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; use arrow::util::pretty::pretty_format_batches; use rstest::*; use tempfile::TempDir; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{binary, col, Column}; - use datafusion_physical_expr::intervals::test_utils::gen_conjunctive_numeric_expr; + use datafusion_physical_expr::intervals::test_utils::{ + gen_conjunctive_interval_expr, gen_conjunctive_numeric_expr, + gen_conjunctive_timestamp_expr, + }; use datafusion_physical_expr::PhysicalExpr; use crate::physical_plan::joins::{ @@ -1658,6 +1661,42 @@ mod tests { _ => unreachable!(), } } + fn join_expr_tests_fixture_temporal( + expr_id: usize, + left_col: Arc, + right_col: Arc, + schema: &Schema, + ) -> Arc { + match expr_id { + 0 => gen_conjunctive_interval_expr( + left_col, + right_col, + Operator::Minus, + Operator::Minus, + Operator::Minus, + Operator::Minus, + 5, + 4, + 3, + 2, + schema, + ), + 1 => gen_conjunctive_timestamp_expr( + left_col, + right_col, + Operator::Minus, + Operator::Minus, + Operator::Minus, + Operator::Minus, + 5, + 4, + 3, + 2, + schema, + ), + _ => unreachable!(), + } + } fn build_sides_record_batches( table_size: i32, key_cardinality: (i32, i32), @@ -1733,6 +1772,41 @@ mod tests { Ok((left, right)) } + fn build_sides_record_batches_temporal( + table_size: 
i32, + key_cardinality: i32, + ) -> Result<(RecordBatch, RecordBatch)> { + let initial_range = 0..table_size; + let ordered: ArrayRef = Arc::new(TimestampMillisecondArray::from( + initial_range + .clone() + .map(|x| x as i64 + 1672531200000) + .collect::>(), + )); + let ordered2: ArrayRef = Arc::new(IntervalDayTimeArray::from( + initial_range + .clone() + .map(|x| x as i64 * 15) + .collect::>(), + )); + let cardinality_key = Arc::new(Int32Array::from_iter( + initial_range + .map(|x| x % key_cardinality) + .collect::>(), + )); + let left = RecordBatch::try_from_iter(vec![ + ("la1", ordered.clone()), + ("la2", ordered2.clone()), + ("lc1", cardinality_key.clone()), + ])?; + let right = RecordBatch::try_from_iter(vec![ + ("ra1", ordered), + ("ra2", ordered2), + ("rc1", cardinality_key), + ])?; + Ok((left, right)) + } + fn create_memory_table( left_batch: RecordBatch, right_batch: RecordBatch, @@ -2470,4 +2544,168 @@ mod tests { assert_eq!(left_side_joiner.visited_rows.is_empty(), should_be_empty); Ok(()) } + + #[tokio::test(flavor = "multi_thread")] + async fn with_temporal_columns() -> Result<()> { + let cardinality = 10; + let join_type = JoinType::Full; + let config = SessionConfig::new().with_repartition_joins(false); + let session_ctx = SessionContext::with_config(config); + let task_ctx = session_ctx.task_ctx(); + let (left_batch, right_batch) = + build_sides_record_batches_temporal(TABLE_SIZE, cardinality)?; + let left_schema = &left_batch.schema(); + let right_schema = &right_batch.schema(); + let on = vec![( + Column::new_with_schema("lc1", left_schema)?, + Column::new_with_schema("rc1", right_schema)?, + )]; + // test for timestamp - interval + let left_sorted = vec![PhysicalSortExpr { + expr: col("la1", left_schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + let right_sorted = vec![PhysicalSortExpr { + expr: col("ra1", right_schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + let 
(left, right) = create_memory_table( + left_batch.clone(), + right_batch.clone(), + left_sorted, + right_sorted, + 13, + )?; + let intermediate_schema = Schema::new(vec![ + Field::new( + "left", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new( + "right", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + ]); + let filter_expr = join_expr_tests_fixture_temporal( + 0, + col("left", &intermediate_schema)?, + col("right", &intermediate_schema)?, + &intermediate_schema, + ); + let column_indices = vec![ + ColumnIndex { + index: 0, + side: JoinSide::Left, + }, + ColumnIndex { + index: 0, + side: JoinSide::Right, + }, + ]; + let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); + experiment(left, right, filter, join_type, on.clone(), task_ctx.clone()).await?; + + // test for timestamp - timestamp + let left_sorted = vec![PhysicalSortExpr { + expr: col("la1", left_schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + let right_sorted = vec![PhysicalSortExpr { + expr: col("ra1", right_schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + let (left, right) = create_memory_table( + left_batch.clone(), + right_batch.clone(), + left_sorted, + right_sorted, + 13, + )?; + let intermediate_schema = Schema::new(vec![ + Field::new( + "left", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new( + "right", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + ]); + let filter_expr = join_expr_tests_fixture_temporal( + 1, + col("left", &intermediate_schema)?, + col("right", &intermediate_schema)?, + &intermediate_schema, + ); + let column_indices = vec![ + ColumnIndex { + index: 0, + side: JoinSide::Left, + }, + ColumnIndex { + index: 0, + side: JoinSide::Right, + }, + ]; + let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); + experiment(left, right, filter, join_type, 
on.clone(), task_ctx.clone()).await?; + + // test for interval - interval + let left_sorted = vec![PhysicalSortExpr { + expr: col("la2", left_schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + let right_sorted = vec![PhysicalSortExpr { + expr: col("ra2", right_schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + let (left, right) = + create_memory_table(left_batch, right_batch, left_sorted, right_sorted, 13)?; + let intermediate_schema = Schema::new(vec![ + Field::new("left", DataType::Interval(IntervalUnit::DayTime), false), + Field::new("right", DataType::Interval(IntervalUnit::DayTime), false), + ]); + let filter_expr = join_expr_tests_fixture_temporal( + 0, + col("left", &intermediate_schema)?, + col("right", &intermediate_schema)?, + &intermediate_schema, + ); + let column_indices = vec![ + ColumnIndex { + index: 1, + side: JoinSide::Left, + }, + ColumnIndex { + index: 1, + side: JoinSide::Right, + }, + ]; + let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); + experiment(left, right, filter, join_type, on, task_ctx).await?; + + Ok(()) + } } diff --git a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt index 94b954bfde1e..27c7af23aa18 100644 --- a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt +++ b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt @@ -279,7 +279,7 @@ query error Cannot automatically convert Interval\(DayTime\) to Interval\(MonthD --- select arrow_cast(interval '30 minutes', 'Interval(MonthDayNano)'); -query error DataFusion error: This feature is not implemented: Can't create a scalar from array of type "Interval\(MonthDayNano\)" +statement ok select arrow_cast('30 minutes', 'Interval(MonthDayNano)'); diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 
711679a9d36d..69f024a920e9 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -469,6 +469,102 @@ macro_rules! min_max { ) => { typed_min_max!(lhs, rhs, Time64Nanosecond, $OP) } + ( + ScalarValue::IntervalYearMonth(lhs), + ScalarValue::IntervalYearMonth(rhs), + ) => { + typed_min_max!(lhs, rhs, IntervalYearMonth, $OP) + } + ( + ScalarValue::IntervalYearMonth(lhs), + ScalarValue::IntervalDayTime(rhs), + ) => { + match (stringify!($OP) , ScalarValue::IntervalYearMonth(*lhs).partial_cmp(&ScalarValue::IntervalDayTime(*rhs))) { + ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalDayTime(*rhs), + (_, Some(_)) => ScalarValue::IntervalYearMonth(*lhs), + (_,_) => return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", + ScalarValue::IntervalYearMonth(*lhs), ScalarValue::IntervalDayTime(*rhs) + ))) + } + } + ( + ScalarValue::IntervalYearMonth(lhs), + ScalarValue::IntervalMonthDayNano(rhs), + ) => { + match (stringify!($OP) , ScalarValue::IntervalYearMonth(*lhs).partial_cmp(&ScalarValue::IntervalMonthDayNano(*rhs))) { + ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalMonthDayNano(*rhs), + (_, Some(_)) => ScalarValue::IntervalYearMonth(*lhs), + (_,_) => return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", + ScalarValue::IntervalYearMonth(*lhs), ScalarValue::IntervalMonthDayNano(*rhs) + ))) + } + } + ( + ScalarValue::IntervalDayTime(lhs), + ScalarValue::IntervalDayTime(rhs), + ) => { + typed_min_max!(lhs, rhs, IntervalDayTime, $OP) + } + ( + ScalarValue::IntervalDayTime(lhs), + ScalarValue::IntervalYearMonth(rhs), + ) => { + match (stringify!($OP) , ScalarValue::IntervalDayTime(*lhs).partial_cmp(&ScalarValue::IntervalYearMonth(*rhs))) { + ("min", 
Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalYearMonth(*rhs), + (_, Some(_)) => ScalarValue::IntervalDayTime(*lhs), + (_,_) => return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", + ScalarValue::IntervalDayTime(*lhs), ScalarValue::IntervalYearMonth(*rhs) + ))) + } + } + ( + ScalarValue::IntervalDayTime(lhs), + ScalarValue::IntervalMonthDayNano(rhs), + ) => { + match (stringify!($OP) , ScalarValue::IntervalDayTime(*lhs).partial_cmp(&ScalarValue::IntervalMonthDayNano(*rhs))) { + ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalMonthDayNano(*rhs), + (_, Some(_)) => ScalarValue::IntervalDayTime(*lhs), + (_,_) => return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", + ScalarValue::IntervalDayTime(*lhs), ScalarValue::IntervalMonthDayNano(*rhs) + ))) + } + } + ( + ScalarValue::IntervalMonthDayNano(lhs), + ScalarValue::IntervalMonthDayNano(rhs), + ) => { + typed_min_max!(lhs, rhs, IntervalMonthDayNano, $OP) + } + ( + ScalarValue::IntervalMonthDayNano(lhs), + ScalarValue::IntervalYearMonth(rhs), + ) => { + match (stringify!($OP) , ScalarValue::IntervalMonthDayNano(*lhs).partial_cmp(&ScalarValue::IntervalYearMonth(*rhs))) { + ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalYearMonth(*rhs), + (_, Some(_)) => ScalarValue::IntervalMonthDayNano(*lhs), + (_,_) => return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", + ScalarValue::IntervalMonthDayNano(*lhs), ScalarValue::IntervalYearMonth(*rhs) + ))) + } + } + ( + ScalarValue::IntervalMonthDayNano(lhs), + ScalarValue::IntervalDayTime(rhs), + ) => { + match (stringify!($OP) , 
ScalarValue::IntervalMonthDayNano(*lhs).partial_cmp(&ScalarValue::IntervalDayTime(*rhs))) { + ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalDayTime(*rhs), + (_, Some(_)) => ScalarValue::IntervalMonthDayNano(*lhs), + (_,_) => return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", + ScalarValue::IntervalMonthDayNano(*lhs), ScalarValue::IntervalDayTime(*rhs) + ))) + } + } e => { return Err(DataFusionError::Internal(format!( "MIN/MAX is not expected to receive scalars of incompatible types {:?}", diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 4e65a9fdd539..c074a194d05c 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -48,10 +48,18 @@ use arrow::compute::kernels::comparison::{ eq_dyn_utf8_scalar, gt_dyn_utf8_scalar, gt_eq_dyn_utf8_scalar, lt_dyn_utf8_scalar, lt_eq_dyn_utf8_scalar, neq_dyn_utf8_scalar, }; +use arrow::compute::{try_unary, unary}; use arrow::datatypes::*; use adapter::{eq_dyn, gt_dyn, gt_eq_dyn, lt_dyn, lt_eq_dyn, neq_dyn}; use arrow::compute::kernels::concat_elements::concat_elements_utf8; +use chrono::NaiveDateTime; +use datafusion_common::scalar::{ + calculate_naives, microseconds_add, microseconds_sub, milliseconds_add, + milliseconds_sub, nanoseconds_add, nanoseconds_sub, op_dt, op_dt_mdn, op_mdn, op_ym, + op_ym_dt, op_ym_mdn, parse_timezones, seconds_add, seconds_sub, MILLISECOND_MODE, + NANOSECOND_MODE, +}; use datafusion_expr::type_coercion::{is_timestamp, is_utf8_or_large_utf8}; use kernels::{ bitwise_and, bitwise_and_scalar, bitwise_or, bitwise_or_scalar, bitwise_shift_left, @@ -77,7 +85,11 @@ use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison}; use crate::intervals::{apply_operator, Interval}; use crate::physical_expr::down_cast_any_ref; 
use crate::{analysis_expect, AnalysisContext, ExprBoundaries, PhysicalExpr}; -use datafusion_common::cast::as_boolean_array; +use datafusion_common::cast::{ + as_boolean_array, as_interval_dt_array, as_interval_mdn_array, as_interval_ym_array, + as_timestamp_microsecond_array, as_timestamp_millisecond_array, + as_timestamp_nanosecond_array, as_timestamp_second_array, +}; use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::type_coercion::binary::binary_operator_data_type; @@ -1229,6 +1241,257 @@ pub fn binary( Ok(Arc::new(BinaryExpr::new(lhs, op, rhs))) } +/// This function handles the Timestamp - Timestamp operations, +/// where the first one is an array, and the second one is a scalar, +/// hence the result is also an array. +pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result { + let ret = match (array.data_type(), scalar) { + ( + DataType::Timestamp(TimeUnit::Second, opt_tz_lhs), + ScalarValue::TimestampSecond(Some(rhs), opt_tz_rhs), + ) => { + let prim_array = as_timestamp_second_array(&array)?; + let ret: PrimitiveArray = + arrow::compute::try_unary(prim_array, |lhs| { + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); + let (naive_lhs, naive_rhs) = calculate_naives::( + lhs.mul_wrapping(1000), + parsed_lhs_tz, + rhs.mul_wrapping(1000), + parsed_rhs_tz, + )?; + Ok(seconds_sub( + NaiveDateTime::timestamp(&naive_lhs), + NaiveDateTime::timestamp(&naive_rhs), + )) + })?; + Arc::new(ret) as ArrayRef + } + ( + DataType::Timestamp(TimeUnit::Millisecond, opt_tz_lhs), + ScalarValue::TimestampMillisecond(Some(rhs), opt_tz_rhs), + ) => { + let prim_array = as_timestamp_millisecond_array(&array)?; + let ret: PrimitiveArray = + arrow::compute::try_unary(prim_array, |lhs| { + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); + let (naive_lhs, naive_rhs) = calculate_naives::( + lhs, + parsed_lhs_tz, 
+ *rhs, + parsed_rhs_tz, + )?; + Ok(milliseconds_sub( + NaiveDateTime::timestamp_millis(&naive_lhs), + NaiveDateTime::timestamp_millis(&naive_rhs), + )) + })?; + Arc::new(ret) as ArrayRef + } + ( + DataType::Timestamp(TimeUnit::Microsecond, opt_tz_lhs), + ScalarValue::TimestampMicrosecond(Some(rhs), opt_tz_rhs), + ) => { + let prim_array = as_timestamp_microsecond_array(&array)?; + let ret: PrimitiveArray = + arrow::compute::try_unary(prim_array, |lhs| { + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); + let (naive_lhs, naive_rhs) = calculate_naives::( + lhs.mul_wrapping(1000), + parsed_lhs_tz, + rhs.mul_wrapping(1000), + parsed_rhs_tz, + )?; + Ok(microseconds_sub( + NaiveDateTime::timestamp_micros(&naive_lhs), + NaiveDateTime::timestamp_micros(&naive_rhs), + )) + })?; + Arc::new(ret) as ArrayRef + } + ( + DataType::Timestamp(TimeUnit::Nanosecond, opt_tz_lhs), + ScalarValue::TimestampNanosecond(Some(rhs), opt_tz_rhs), + ) => { + let prim_array = as_timestamp_nanosecond_array(&array)?; + let ret: PrimitiveArray = + arrow::compute::try_unary(prim_array, |lhs| { + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); + let (naive_lhs, naive_rhs) = calculate_naives::( + lhs, + parsed_lhs_tz, + *rhs, + parsed_rhs_tz, + )?; + Ok(nanoseconds_sub( + NaiveDateTime::timestamp_nanos(&naive_lhs), + NaiveDateTime::timestamp_nanos(&naive_rhs), + )) + })?; + Arc::new(ret) as ArrayRef + } + (_, _) => { + return Err(DataFusionError::Internal(format!( + "Invalid array - scalar types for Timestamp subtraction: {:?} - {:?}", + array.data_type(), + scalar.get_datatype() + ))); + } + }; + Ok(ColumnarValue::Array(ret)) +} +/// This function handles the Timestamp - Interval operations, +/// where the first one is an array, and the second one is a scalar, +/// hence the result is also an array. 
+pub fn ts_scalar_interval_op( + array: ArrayRef, + sign: i32, + scalar: &ScalarValue, +) -> Result { + let ret = match array.data_type() { + DataType::Timestamp(TimeUnit::Second, tz) => { + let array = as_timestamp_second_array(&array)?; + let ret: PrimitiveArray = + try_unary::( + array, + |ts_s| Ok(seconds_add(ts_s, scalar, sign)?), + )?; + Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + } + DataType::Timestamp(TimeUnit::Millisecond, tz) => { + let array = as_timestamp_millisecond_array(&array)?; + let ret: PrimitiveArray = + try_unary::( + array, + |ts_ms| Ok(milliseconds_add(ts_ms, scalar, sign)?), + )?; + Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + } + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + let array = as_timestamp_microsecond_array(&array)?; + let ret: PrimitiveArray = + try_unary::( + array, + |ts_us| Ok(microseconds_add(ts_us, scalar, sign)?), + )?; + Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + let array = as_timestamp_nanosecond_array(&array)?; + let ret: PrimitiveArray = + try_unary::( + array, + |ts_ns| Ok(nanoseconds_add(ts_ns, scalar, sign)?), + )?; + Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + } + _ => Err(DataFusionError::Internal(format!( + "Invalid lhs type for Timestamp vs Interval operations: {}", + array.data_type() + )))?, + }; + Ok(ColumnarValue::Array(ret)) +} +pub fn interval_scalar_interval_op( + array: ArrayRef, + sign: i32, + scalar: &ScalarValue, +) -> Result { + let ret = match (array.data_type(), scalar) { + ( + DataType::Interval(IntervalUnit::YearMonth), + ScalarValue::IntervalYearMonth(Some(rhs)), + ) => { + let array = as_interval_ym_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_ym(lhs, *rhs, sign)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::YearMonth), + ScalarValue::IntervalDayTime(Some(rhs)), + ) => { + let array = as_interval_ym_array(&array)?; + let ret: 
PrimitiveArray = + unary(array, |lhs| op_ym_dt(lhs, *rhs, sign, false)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::YearMonth), + ScalarValue::IntervalMonthDayNano(Some(rhs)), + ) => { + let array = as_interval_ym_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_ym_mdn(lhs, *rhs, sign, false)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::DayTime), + ScalarValue::IntervalYearMonth(Some(rhs)), + ) => { + let array = as_interval_dt_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_ym_dt(*rhs, lhs, sign, true)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::DayTime), + ScalarValue::IntervalDayTime(Some(rhs)), + ) => { + let array = as_interval_dt_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_dt(*rhs, lhs, sign)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::DayTime), + ScalarValue::IntervalMonthDayNano(Some(rhs)), + ) => { + let array = as_interval_dt_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_dt_mdn(lhs, *rhs, sign, false)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::MonthDayNano), + ScalarValue::IntervalYearMonth(Some(rhs)), + ) => { + let array = as_interval_mdn_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_ym_mdn(*rhs, lhs, sign, true)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::MonthDayNano), + ScalarValue::IntervalDayTime(Some(rhs)), + ) => { + let array = as_interval_mdn_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_dt_mdn(*rhs, lhs, sign, true)); + Arc::new(ret) as ArrayRef + } + ( + DataType::Interval(IntervalUnit::MonthDayNano), + ScalarValue::IntervalMonthDayNano(Some(rhs)), + ) => { + let array = as_interval_mdn_array(&array)?; + let ret: PrimitiveArray = + unary(array, |lhs| op_mdn(*rhs, lhs, sign)); + Arc::new(ret) as ArrayRef + } + _ => 
Err(DataFusionError::Internal(format!( + "Invalid operands for Interval vs Interval operations: {} - {}", + array.data_type(), + scalar.get_datatype(), + )))?, + }; + Ok(ColumnarValue::Array(ret)) +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 518a28268765..a0b225dd9693 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison}; +use crate::intervals::{apply_operator, Interval}; use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use arrow::compute::{binary, unary}; +use arrow::compute::{binary, try_unary}; use arrow::datatypes::{ ArrowNativeTypeOp, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, Schema, TimeUnit, @@ -38,6 +40,10 @@ use std::any::Any; use std::fmt::{Display, Formatter}; use std::sync::Arc; +use super::binary::{ + interval_scalar_interval_op, ts_scalar_interval_op, ts_scalar_ts_op, +}; + /// Perform DATE/TIME/TIMESTAMP +/ INTERVAL math #[derive(Debug)] pub struct DateTimeIntervalExpr { @@ -68,6 +74,7 @@ impl DateTimeIntervalExpr { DataType::Interval(_), ) | (DataType::Timestamp(_, _), Operator::Minus, DataType::Timestamp(_, _)) + | (DataType::Interval(_), Operator::Plus, DataType::Timestamp(_, _)) | ( DataType::Interval(_), Operator::Plus | Operator::Minus, @@ -79,7 +86,7 @@ impl DateTimeIntervalExpr { input_schema: input_schema.clone(), }), (lhs, _, rhs) => Err(DataFusionError::Execution(format!( - "Invalid operation between '{lhs}' and '{rhs}' for DateIntervalExpr" + "Invalid operation {op} between '{lhs}' and '{rhs}' for DateIntervalExpr" ))), } } @@ -150,7 
+157,7 @@ impl PhysicalExpr for DateTimeIntervalExpr { })) } (ColumnarValue::Array(array_lhs), ColumnarValue::Scalar(operand_rhs)) => { - evaluate_array(array_lhs, sign, &operand_rhs) + evaluate_temporal_array(array_lhs, sign, &operand_rhs) } (ColumnarValue::Array(array_lhs), ColumnarValue::Array(array_rhs)) => { @@ -163,6 +170,42 @@ impl PhysicalExpr for DateTimeIntervalExpr { } } + fn evaluate_bounds(&self, children: &[&Interval]) -> Result { + // Get children intervals: + let left_interval = children[0]; + let right_interval = children[1]; + // Calculate current node's interval: + apply_operator(&self.op, left_interval, right_interval) + } + + fn propagate_constraints( + &self, + interval: &Interval, + children: &[&Interval], + ) -> Result>> { + // Get children intervals. Graph brings + let left_interval = children[0]; + let right_interval = children[1]; + let (left, right) = if self.op.is_comparison_operator() { + if let Interval { + lower: ScalarValue::Boolean(Some(false)), + upper: ScalarValue::Boolean(Some(false)), + } = interval + { + // TODO: We will handle strictly false clauses by negating + // the comparison operator (e.g. GT to LE, LT to GE) + // once open/closed intervals are supported. + return Ok(vec![]); + } + // Propagate the comparison operator. + propagate_comparison(&self.op, left_interval, right_interval)? + } else { + // Propagate the arithmetic operator. + propagate_arithmetic(&self.op, interval, left_interval, right_interval)? 
+ }; + Ok(vec![left, right]) + } + fn children(&self) -> Vec> { vec![self.lhs.clone(), self.rhs.clone()] } @@ -189,64 +232,44 @@ impl PartialEq for DateTimeIntervalExpr { } } -pub fn evaluate_array( +pub fn evaluate_temporal_array( array: ArrayRef, sign: i32, scalar: &ScalarValue, ) -> Result { - let ret = match array.data_type() { - DataType::Date32 => { - let array = as_date32_array(&array)?; - Arc::new(unary::(array, |days| { - date32_add(days, scalar, sign).unwrap() - })) as ArrayRef + match (array.data_type(), scalar.get_datatype()) { + // Timestamp - Timestamp + (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) if sign == -1 => { + ts_scalar_ts_op(array, scalar) } - DataType::Date64 => { - let array = as_date64_array(&array)?; - Arc::new(unary::(array, |ms| { - date64_add(ms, scalar, sign).unwrap() - })) as ArrayRef - } - DataType::Timestamp(TimeUnit::Second, _) => { - let array = as_timestamp_second_array(&array)?; - Arc::new(unary::( - array, - |ts_s| seconds_add(ts_s, scalar, sign).unwrap(), - )) as ArrayRef + // Interval +- Interval + (DataType::Interval(_), DataType::Interval(_)) => { + interval_scalar_interval_op(array, sign, scalar) } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let array = as_timestamp_millisecond_array(&array)?; - Arc::new( - unary::( - array, - |ts_ms| milliseconds_add(ts_ms, scalar, sign).unwrap(), - ), - ) as ArrayRef + // Timestamp +- Interval + (DataType::Timestamp(_, _), DataType::Interval(_)) => { + ts_scalar_interval_op(array, sign, scalar) } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let array = as_timestamp_microsecond_array(&array)?; - Arc::new( - unary::( - array, - |ts_us| microseconds_add(ts_us, scalar, sign).unwrap(), - ), - ) as ArrayRef + // Date +- Interval + (DataType::Date32, DataType::Interval(_)) => { + let array = as_date32_array(&array)?; + let ret = Arc::new(try_unary::(array, |days| { + Ok(date32_add(days, scalar, sign)?) + })?) 
as ArrayRef; + Ok(ColumnarValue::Array(ret)) } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let array = as_timestamp_nanosecond_array(&array)?; - Arc::new( - unary::( - array, - |ts_ns| nanoseconds_add(ts_ns, scalar, sign).unwrap(), - ), - ) as ArrayRef + (DataType::Date64, DataType::Interval(_)) => { + let array = as_date64_array(&array)?; + let ret = Arc::new(try_unary::(array, |ms| { + Ok(date64_add(ms, scalar, sign)?) + })?) as ArrayRef; + Ok(ColumnarValue::Array(ret)) } - _ => Err(DataFusionError::Execution(format!( + (_, _) => Err(DataFusionError::Execution(format!( "Invalid lhs type for DateIntervalExpr: {}", array.data_type() )))?, - }; - Ok(ColumnarValue::Array(ret)) + } } macro_rules! ts_sub_op { diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index 66367001c642..c393e1ce7d5f 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use arrow_schema::DataType; use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::type_coercion::binary::coerce_types; use datafusion_expr::Operator; use petgraph::graph::NodeIndex; use petgraph::stable_graph::{DefaultIx, StableGraph}; @@ -237,9 +238,14 @@ pub fn propagate_arithmetic( /// If we have expression < 0, expression must have the range [-∞, 0]. /// Currently, we only support strict inequalities since open/closed intervals /// are not implemented yet. 
-fn comparison_operator_target(datatype: &DataType, op: &Operator) -> Result { - let unbounded = ScalarValue::try_from(datatype)?; - let zero = ScalarValue::new_zero(datatype)?; +fn comparison_operator_target( + left_datatype: &DataType, + op: &Operator, + right_datatype: &DataType, +) -> Result { + let datatype = coerce_types(left_datatype, &Operator::Minus, right_datatype)?; + let unbounded = ScalarValue::try_from(&datatype)?; + let zero = ScalarValue::new_zero(&datatype)?; Ok(match *op { Operator::Gt => Interval { lower: zero, @@ -265,7 +271,11 @@ pub fn propagate_comparison( left_child: &Interval, right_child: &Interval, ) -> Result<(Option, Option)> { - let parent = comparison_operator_target(&left_child.get_datatype(), op)?; + let parent = comparison_operator_target( + &left_child.get_datatype(), + op, + &right_child.get_datatype(), + )?; propagate_arithmetic(&Operator::Minus, &parent, left_child, right_child) } diff --git a/datafusion/physical-expr/src/intervals/interval_aritmetic.rs b/datafusion/physical-expr/src/intervals/interval_aritmetic.rs index 7fc3641b25ef..c3160bf742d1 100644 --- a/datafusion/physical-expr/src/intervals/interval_aritmetic.rs +++ b/datafusion/physical-expr/src/intervals/interval_aritmetic.rs @@ -24,6 +24,7 @@ use std::fmt::{Display, Formatter}; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::DataType; use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::type_coercion::binary::coerce_types; use datafusion_expr::Operator; use crate::aggregate::min_max::{max, min}; @@ -201,13 +202,20 @@ impl Interval { /// one can choose single values arbitrarily from each of the operands. 
pub fn add>(&self, other: T) -> Result { let rhs = other.borrow(); + let mut datatype = + coerce_types(&self.get_datatype(), &Operator::Minus, &rhs.get_datatype()); + if datatype.is_err() { + datatype = + coerce_types(&rhs.get_datatype(), &Operator::Minus, &self.get_datatype()); + } + let datatype = datatype?; let lower = if self.lower.is_null() || rhs.lower.is_null() { - ScalarValue::try_from(self.lower.get_datatype()) + ScalarValue::try_from(&datatype) } else { self.lower.add(&rhs.lower) }?; let upper = if self.upper.is_null() || rhs.upper.is_null() { - ScalarValue::try_from(self.upper.get_datatype()) + ScalarValue::try_from(&datatype) } else { self.upper.add(&rhs.upper) }?; @@ -221,12 +229,20 @@ impl Interval { pub fn sub>(&self, other: T) -> Result { let rhs = other.borrow(); let lower = if self.lower.is_null() || rhs.upper.is_null() { - ScalarValue::try_from(self.lower.get_datatype()) + ScalarValue::try_from(coerce_types( + &self.lower.get_datatype(), + &Operator::Minus, + &rhs.lower.get_datatype(), + )?) } else { self.lower.sub(&rhs.upper) }?; let upper = if self.upper.is_null() || rhs.lower.is_null() { - ScalarValue::try_from(self.upper.get_datatype()) + ScalarValue::try_from(coerce_types( + &self.upper.get_datatype(), + &Operator::Minus, + &rhs.upper.get_datatype(), + )?) 
} else { self.upper.sub(&rhs.lower) }?; diff --git a/datafusion/physical-expr/src/intervals/test_utils.rs b/datafusion/physical-expr/src/intervals/test_utils.rs index ba02f4ff7aac..7929b05b7d14 100644 --- a/datafusion/physical-expr/src/intervals/test_utils.rs +++ b/datafusion/physical-expr/src/intervals/test_utils.rs @@ -19,8 +19,9 @@ use std::sync::Arc; -use crate::expressions::{BinaryExpr, Literal}; +use crate::expressions::{BinaryExpr, DateTimeIntervalExpr, Literal}; use crate::PhysicalExpr; +use arrow_schema::Schema; use datafusion_common::ScalarValue; use datafusion_expr::Operator; @@ -65,3 +66,131 @@ pub fn gen_conjunctive_numeric_expr( let right_expr = Arc::new(BinaryExpr::new(right_and_1, Operator::Lt, right_and_2)); Arc::new(BinaryExpr::new(left_expr, Operator::And, right_expr)) } + +#[allow(clippy::too_many_arguments)] +/// This test function generates a conjunctive statement with +/// two timestamp terms with the following form: +/// left_col (op_1) a > right_col (op_2) b AND left_col (op_3) c < right_col (op_4) d +pub fn gen_conjunctive_interval_expr( + left_col: Arc, + right_col: Arc, + op_1: Operator, + op_2: Operator, + op_3: Operator, + op_4: Operator, + a: i32, + b: i32, + c: i32, + d: i32, + schema: &Schema, +) -> Arc { + let left_and_1 = Arc::new( + DateTimeIntervalExpr::try_new( + left_col.clone(), + op_1, + Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(a.into())))), + schema, + ) + .unwrap(), + ); + let left_and_2 = Arc::new( + DateTimeIntervalExpr::try_new( + right_col.clone(), + op_2, + Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(b.into())))), + schema, + ) + .unwrap(), + ); + let right_and_1 = Arc::new( + DateTimeIntervalExpr::try_new( + left_col, + op_3, + Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(c.into())))), + schema, + ) + .unwrap(), + ); + let right_and_2 = Arc::new( + DateTimeIntervalExpr::try_new( + right_col, + op_4, + Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(d.into())))), + schema, 
+ ) + .unwrap(), + ); + let left_expr = Arc::new(BinaryExpr::new(left_and_1, Operator::Gt, left_and_2)); + let right_expr = Arc::new(BinaryExpr::new(right_and_1, Operator::Lt, right_and_2)); + Arc::new(BinaryExpr::new(left_expr, Operator::And, right_expr)) +} + +#[allow(clippy::too_many_arguments)] +/// This test function generates a conjunctive statement with +/// one timestamp and one interval term with the following form: +/// left_col (op_1) a > right_col (op_2) b AND left_col (op_3) c < right_col (op_4) d +pub fn gen_conjunctive_timestamp_expr( + left_col: Arc, + right_col: Arc, + op_1: Operator, + op_2: Operator, + op_3: Operator, + op_4: Operator, + a: i32, + b: i32, + c: i32, + d: i32, + schema: &Schema, +) -> Arc { + let left_and_1 = Arc::new( + DateTimeIntervalExpr::try_new( + left_col.clone(), + op_1, + Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Some(a.into()), + None, + ))), + schema, + ) + .unwrap(), + ); + let left_and_2 = Arc::new( + DateTimeIntervalExpr::try_new( + right_col.clone(), + op_2, + Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Some(b.into()), + None, + ))), + schema, + ) + .unwrap(), + ); + let right_and_1 = Arc::new( + DateTimeIntervalExpr::try_new( + left_col, + op_3, + Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Some(c.into()), + None, + ))), + schema, + ) + .unwrap(), + ); + let right_and_2 = Arc::new( + DateTimeIntervalExpr::try_new( + right_col, + op_4, + Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Some(d.into()), + None, + ))), + schema, + ) + .unwrap(), + ); + let left_expr = Arc::new(BinaryExpr::new(left_and_1, Operator::Gt, left_and_2)); + let right_expr = Arc::new(BinaryExpr::new(right_and_1, Operator::Lt, right_and_2)); + Arc::new(BinaryExpr::new(left_expr, Operator::And, right_expr)) +} From 8bd24548a1345dc284a0bfaea1ff0ab71ef8334e Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 3 Apr 2023 15:58:30 +0300 Subject: [PATCH 36/55] correction after merge --- 
datafusion/common/src/scalar.rs | 91 ------------------- .../physical-expr/src/expressions/datetime.rs | 91 ------------------- 2 files changed, 182 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 875a7ad7d479..431c52b75890 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -1146,22 +1146,6 @@ pub fn milliseconds_add_array( .map(|dt| dt.timestamp_millis()) } -#[inline] -pub fn milliseconds_add_array( - ts_ms: i64, - interval: i128, - sign: i32, -) -> Result { - let mut secs = ts_ms / 1000; - let mut nsecs = ((ts_ms % 1000) * 1_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } - do_date_time_math_array::(secs, nsecs as u32, interval, sign) - .map(|dt| dt.timestamp_millis()) -} - #[inline] pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result { let mut secs = ts_us / 1_000_000; @@ -1190,22 +1174,6 @@ pub fn microseconds_add_array( .map(|dt| dt.timestamp_nanos() / 1000) } -#[inline] -pub fn microseconds_add_array( - ts_us: i64, - interval: i128, - sign: i32, -) -> Result { - let mut secs = ts_us / 1_000_000; - let mut nsecs = ((ts_us % 1_000_000) * 1000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } - do_date_time_math_array::(secs, nsecs as u32, interval, sign) - .map(|dt| dt.timestamp_nanos() / 1000) -} - #[inline] pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result { let mut secs = ts_ns / 1_000_000_000; @@ -1262,51 +1230,6 @@ pub fn nanoseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i128 { IntervalMonthDayNanoType::make_value(0, days, nanos) } -#[inline] -pub fn nanoseconds_add_array( - ts_ns: i64, - interval: i128, - sign: i32, -) -> Result { - let mut secs = ts_ns / 1_000_000_000; - let mut nsecs = (ts_ns % 1_000_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } - do_date_time_math_array::(secs, nsecs as u32, interval, sign) - .map(|dt| dt.timestamp_nanos()) 
-} - -#[inline] -pub fn seconds_sub(ts_lhs: i64, ts_rhs: i64) -> i64 { - let diff_ms = (ts_lhs - ts_rhs) * 1000; - let days = (diff_ms / MILLISECS_IN_ONE_DAY) as i32; - let millis = (diff_ms % MILLISECS_IN_ONE_DAY) as i32; - IntervalDayTimeType::make_value(days, millis) -} -#[inline] -pub fn milliseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i64 { - let diff_ms = ts_lhs - ts_rhs; - let days = (diff_ms / MILLISECS_IN_ONE_DAY) as i32; - let millis = (diff_ms % MILLISECS_IN_ONE_DAY) as i32; - IntervalDayTimeType::make_value(days, millis) -} -#[inline] -pub fn microseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i128 { - let diff_ns = (ts_lhs - ts_rhs) * 1000; - let days = (diff_ns / NANOSECS_IN_ONE_DAY) as i32; - let nanos = diff_ns % NANOSECS_IN_ONE_DAY; - IntervalMonthDayNanoType::make_value(0, days, nanos) -} -#[inline] -pub fn nanoseconds_sub(ts_lhs: i64, ts_rhs: i64) -> i128 { - let diff_ns = ts_lhs - ts_rhs; - let days = (diff_ns / NANOSECS_IN_ONE_DAY) as i32; - let nanos = diff_ns % NANOSECS_IN_ONE_DAY; - IntervalMonthDayNanoType::make_value(0, days, nanos) -} - #[inline] fn do_date_time_math( secs: i64, @@ -2973,20 +2896,6 @@ impl ScalarValue { }, ) } - DataType::Interval(IntervalUnit::DayTime) => { - typed_cast!(array, index, IntervalDayTimeArray, IntervalDayTime) - } - DataType::Interval(IntervalUnit::YearMonth) => { - typed_cast!(array, index, IntervalYearMonthArray, IntervalYearMonth) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - typed_cast!( - array, - index, - IntervalMonthDayNanoArray, - IntervalMonthDayNano - ) - } other => { return Err(DataFusionError::NotImplemented(format!( "Can't create a scalar from array of type \"{other:?}\"" diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 96427398f692..a0b225dd9693 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -363,97 +363,6 @@ pub fn 
evaluate_temporal_arrays( Ok(ColumnarValue::Array(ret)) } -macro_rules! ts_sub_op { - ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $ts_unit:expr, $mode:expr, $type_out:ty) => {{ - let prim_array_lhs = $caster(&$lhs)?; - let prim_array_rhs = $caster(&$rhs)?; - let ret: PrimitiveArray<$type_out> = - arrow::compute::try_binary(prim_array_lhs, prim_array_rhs, |ts1, ts2| { - let (parsed_lhs_tz, parsed_rhs_tz) = - (parse_timezones($lhs_tz)?, parse_timezones($rhs_tz)?); - let (naive_lhs, naive_rhs) = calculate_naives::<$mode>( - ts1.mul_wrapping($coef), - parsed_lhs_tz, - ts2.mul_wrapping($coef), - parsed_rhs_tz, - )?; - Ok($op($ts_unit(&naive_lhs), $ts_unit(&naive_rhs))) - })?; - Arc::new(ret) as ArrayRef - }}; -} -macro_rules! interval_op { - ($lhs:ident, $rhs:ident, $caster:expr, $op:expr, $sign:ident, $type_in:ty) => {{ - let prim_array_lhs = $caster(&$lhs)?; - let prim_array_rhs = $caster(&$rhs)?; - let ret = Arc::new(binary::<$type_in, $type_in, _, $type_in>( - prim_array_lhs, - prim_array_rhs, - |interval1, interval2| $op(interval1, interval2, $sign), - )?) as ArrayRef; - ret - }}; -} -macro_rules! interval_cross_op { - ($lhs:ident, $rhs:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $commute:ident, $type_in1:ty, $type_in2:ty) => {{ - let prim_array_lhs = $caster1(&$lhs)?; - let prim_array_rhs = $caster2(&$rhs)?; - let ret = Arc::new(binary::<$type_in1, $type_in2, _, IntervalMonthDayNanoType>( - prim_array_lhs, - prim_array_rhs, - |interval1, interval2| $op(interval1, interval2, $sign, $commute), - )?) as ArrayRef; - ret - }}; -} -macro_rules! 
ts_interval_op { - ($lhs:ident, $rhs:ident, $tz:ident, $caster1:expr, $caster2:expr, $op:expr, $sign:ident, $type_in1:ty, $type_in2:ty) => {{ - let prim_array_lhs = $caster1(&$lhs)?; - let prim_array_rhs = $caster2(&$rhs)?; - let ret: PrimitiveArray<$type_in1> = arrow::compute::try_binary( - prim_array_lhs, - prim_array_rhs, - |ts, interval| Ok($op(ts, interval as i128, $sign)?), - )?; - Arc::new(ret.with_timezone_opt($tz.clone())) as ArrayRef - }}; -} -// This function evaluates temporal array operations, such as timestamp - timestamp, interval + interval, -// timestamp + interval, and interval + timestamp. It takes two arrays as input and an integer sign representing -// the operation (+1 for addition and -1 for subtraction). It returns a ColumnarValue as output, which can hold -// either a scalar or an array. -pub fn evaluate_temporal_arrays( - array_lhs: &ArrayRef, - sign: i32, - array_rhs: &ArrayRef, -) -> Result { - let ret = match (array_lhs.data_type(), array_rhs.data_type()) { - // Timestamp - Timestamp operations, operands of only the same types are supported. - (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) => { - ts_array_op(array_lhs, array_rhs)? - } - // Interval (+ , -) Interval operations - (DataType::Interval(_), DataType::Interval(_)) => { - interval_array_op(array_lhs, array_rhs, sign)? - } - // Timestamp (+ , -) Interval and Interval + Timestamp operations - // Interval - Timestamp operation is not rational hence not supported - (DataType::Timestamp(_, _), DataType::Interval(_)) => { - ts_interval_array_op(array_lhs, sign, array_rhs)? - } - (DataType::Interval(_), DataType::Timestamp(_, _)) if sign == 1 => { - ts_interval_array_op(array_rhs, sign, array_lhs)? 
- } - (_, _) => Err(DataFusionError::Execution(format!( - "Invalid array types for DateIntervalExpr: {:?} {} {:?}", - array_lhs.data_type(), - sign, - array_rhs.data_type() - )))?, - }; - Ok(ColumnarValue::Array(ret)) -} - /// Performs a timestamp subtraction operation on two arrays and returns the resulting array. fn ts_array_op(array_lhs: &ArrayRef, array_rhs: &ArrayRef) -> Result { match (array_lhs.data_type(), array_rhs.data_type()) { From 6486083493d63cc676634a65c9d82411cc3eed58 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 4 Apr 2023 13:37:45 +0300 Subject: [PATCH 37/55] change match order --- datafusion/common/src/scalar.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 431c52b75890..b99cebd7d280 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -2821,20 +2821,6 @@ impl ScalarValue { tz_opt ) } - DataType::Interval(IntervalUnit::YearMonth) => { - typed_cast!(array, index, IntervalYearMonthArray, IntervalYearMonth) - } - DataType::Interval(IntervalUnit::DayTime) => { - typed_cast!(array, index, IntervalDayTimeArray, IntervalDayTime) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - typed_cast!( - array, - index, - IntervalMonthDayNanoArray, - IntervalMonthDayNano - ) - } DataType::Dictionary(key_type, _) => { let (values_array, values_index) = match key_type.as_ref() { DataType::Int8 => get_dict_value::(array, index), @@ -2896,6 +2882,20 @@ impl ScalarValue { }, ) } + DataType::Interval(IntervalUnit::DayTime) => { + typed_cast!(array, index, IntervalDayTimeArray, IntervalDayTime) + } + DataType::Interval(IntervalUnit::YearMonth) => { + typed_cast!(array, index, IntervalYearMonthArray, IntervalYearMonth) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + typed_cast!( + array, + index, + IntervalMonthDayNanoArray, + IntervalMonthDayNano + ) + } other => { return 
Err(DataFusionError::NotImplemented(format!( "Can't create a scalar from array of type \"{other:?}\"" From 476ab85d62b3b4691b0a0db21d5b88a8b557eaad Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 4 Apr 2023 14:20:47 +0300 Subject: [PATCH 38/55] minor changes --- .../physical-expr/src/expressions/binary.rs | 8 +++---- .../physical-expr/src/expressions/datetime.rs | 24 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index c074a194d05c..d4ca84699af7 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1252,7 +1252,7 @@ pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result { let prim_array = as_timestamp_second_array(&array)?; let ret: PrimitiveArray = - arrow::compute::try_unary(prim_array, |lhs| { + try_unary(prim_array, |lhs| { let (parsed_lhs_tz, parsed_rhs_tz) = (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); let (naive_lhs, naive_rhs) = calculate_naives::( @@ -1274,7 +1274,7 @@ pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result { let prim_array = as_timestamp_millisecond_array(&array)?; let ret: PrimitiveArray = - arrow::compute::try_unary(prim_array, |lhs| { + try_unary(prim_array, |lhs| { let (parsed_lhs_tz, parsed_rhs_tz) = (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); let (naive_lhs, naive_rhs) = calculate_naives::( @@ -1296,7 +1296,7 @@ pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result { let prim_array = as_timestamp_microsecond_array(&array)?; let ret: PrimitiveArray = - arrow::compute::try_unary(prim_array, |lhs| { + try_unary(prim_array, |lhs| { let (parsed_lhs_tz, parsed_rhs_tz) = (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); let (naive_lhs, naive_rhs) = calculate_naives::( @@ -1318,7 +1318,7 @@ pub fn ts_scalar_ts_op(array: ArrayRef, 
scalar: &ScalarValue) -> Result { let prim_array = as_timestamp_nanosecond_array(&array)?; let ret: PrimitiveArray = - arrow::compute::try_unary(prim_array, |lhs| { + try_unary(prim_array, |lhs| { let (parsed_lhs_tz, parsed_rhs_tz) = (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); let (naive_lhs, naive_rhs) = calculate_naives::( diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index a0b225dd9693..4cb05ce043ba 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -238,18 +238,6 @@ pub fn evaluate_temporal_array( scalar: &ScalarValue, ) -> Result { match (array.data_type(), scalar.get_datatype()) { - // Timestamp - Timestamp - (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) if sign == -1 => { - ts_scalar_ts_op(array, scalar) - } - // Interval +- Interval - (DataType::Interval(_), DataType::Interval(_)) => { - interval_scalar_interval_op(array, sign, scalar) - } - // Timestamp +- Interval - (DataType::Timestamp(_, _), DataType::Interval(_)) => { - ts_scalar_interval_op(array, sign, scalar) - } // Date +- Interval (DataType::Date32, DataType::Interval(_)) => { let array = as_date32_array(&array)?; @@ -265,6 +253,18 @@ pub fn evaluate_temporal_array( })?) 
as ArrayRef; Ok(ColumnarValue::Array(ret)) } + // Timestamp - Timestamp + (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) if sign == -1 => { + ts_scalar_ts_op(array, scalar) + } + // Interval +- Interval + (DataType::Interval(_), DataType::Interval(_)) => { + interval_scalar_interval_op(array, sign, scalar) + } + // Timestamp +- Interval + (DataType::Timestamp(_, _), DataType::Interval(_)) => { + ts_scalar_interval_op(array, sign, scalar) + } (_, _) => Err(DataFusionError::Execution(format!( "Invalid lhs type for DateIntervalExpr: {}", array.data_type() From 5aa05dbdd2c849e573b1f9a142b34b51679ef464 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 4 Apr 2023 14:38:19 +0300 Subject: [PATCH 39/55] simplifications --- .../src/intervals/interval_aritmetic.rs | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/datafusion/physical-expr/src/intervals/interval_aritmetic.rs b/datafusion/physical-expr/src/intervals/interval_aritmetic.rs index c3160bf742d1..c24c96137c08 100644 --- a/datafusion/physical-expr/src/intervals/interval_aritmetic.rs +++ b/datafusion/physical-expr/src/intervals/interval_aritmetic.rs @@ -202,13 +202,8 @@ impl Interval { /// one can choose single values arbitrarily from each of the operands. pub fn add>(&self, other: T) -> Result { let rhs = other.borrow(); - let mut datatype = - coerce_types(&self.get_datatype(), &Operator::Minus, &rhs.get_datatype()); - if datatype.is_err() { - datatype = - coerce_types(&rhs.get_datatype(), &Operator::Minus, &self.get_datatype()); - } - let datatype = datatype?; + let datatype = + coerce_types(&self.get_datatype(), &Operator::Plus, &rhs.get_datatype())?; let lower = if self.lower.is_null() || rhs.lower.is_null() { ScalarValue::try_from(&datatype) } else { @@ -228,21 +223,15 @@ impl Interval { /// if one can choose single values arbitrarily from each of the operands. 
pub fn sub>(&self, other: T) -> Result { let rhs = other.borrow(); + let datatype = + coerce_types(&self.get_datatype(), &Operator::Minus, &rhs.get_datatype())?; let lower = if self.lower.is_null() || rhs.upper.is_null() { - ScalarValue::try_from(coerce_types( - &self.lower.get_datatype(), - &Operator::Minus, - &rhs.lower.get_datatype(), - )?) + ScalarValue::try_from(&datatype) } else { self.lower.sub(&rhs.upper) }?; let upper = if self.upper.is_null() || rhs.lower.is_null() { - ScalarValue::try_from(coerce_types( - &self.upper.get_datatype(), - &Operator::Minus, - &rhs.upper.get_datatype(), - )?) + ScalarValue::try_from(&datatype) } else { self.upper.sub(&rhs.lower) }?; From a3e0f9542ab12efcbc0af9a44c4fb683259fc1f6 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 4 Apr 2023 14:48:48 +0300 Subject: [PATCH 40/55] update lock file --- datafusion-cli/Cargo.lock | 262 ++++++++++++++++---------------------- 1 file changed, 113 insertions(+), 149 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index de3b57344564..273267ebb908 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -74,16 +74,16 @@ checksum = "990dfa1a9328504aa135820da1c95066537b69ad94c04881b785f64328e0fa6b" dependencies = [ "ahash", "arrow-arith", - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", + "arrow-array", + "arrow-buffer", "arrow-cast", "arrow-csv", - "arrow-data 36.0.0", + "arrow-data", "arrow-ipc", "arrow-json", "arrow-ord", "arrow-row", - "arrow-schema 36.0.0", + "arrow-schema", "arrow-select", "arrow-string", ] @@ -94,29 +94,12 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b2e52de0ab54173f9b08232b7184c26af82ee7ab4ac77c83396633c90199fa" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", - "chrono", - "half", - "num", -] - -[[package]] -name = "arrow-array" -version = "34.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d35d5475e65c57cffba06d0022e3006b677515f99b54af33a7cd54f6cdd4a5b5" -dependencies = [ - "ahash", - "arrow-buffer 34.0.0", - "arrow-data 34.0.0", - "arrow-schema 34.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", - "chrono-tz", "half", - "hashbrown 0.13.2", "num", ] @@ -127,9 +110,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e10849b60c17dbabb334be1f4ef7550701aa58082b71335ce1ed586601b2f423" dependencies = [ "ahash", - "arrow-buffer 36.0.0", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "chrono-tz", "half", @@ -137,16 +120,6 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-buffer" -version = "34.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b4ec72eda7c0207727df96cf200f539749d736b21f3e782ece113e18c1a0a7" -dependencies = [ - "half", - "num", -] - [[package]] name = "arrow-buffer" version = "36.0.0" @@ -163,10 +136,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b88897802515d7b193e38b27ddd9d9e43923d410a9e46307582d756959ee9595" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "arrow-select", "chrono", "comfy-table", @@ -180,11 +153,11 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c8220d9741fc37961262710ceebd8451a5b393de57c464f0267ffdda1775c0a" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", + "arrow-array", + "arrow-buffer", "arrow-cast", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-data", + "arrow-schema", "chrono", "csv", "csv-core", @@ -193,26 +166,14 @@ dependencies = [ "regex", ] -[[package]] -name = "arrow-data" -version = "34.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "27cc673ee6989ea6e4b4e8c7d461f7e06026a096c8f0b1a7288885ff71ae1e56" -dependencies = [ - "arrow-buffer 34.0.0", - "arrow-schema 34.0.0", - "half", - "num", -] - [[package]] name = "arrow-data" version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "533f937efa1aaad9dc86f6a0e382c2fa736a4943e2090c946138079bdf060cef" dependencies = [ - "arrow-buffer 36.0.0", - "arrow-schema 36.0.0", + "arrow-buffer", + "arrow-schema", "half", "num", ] @@ -223,11 +184,11 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18b75296ff01833f602552dff26a423fc213db8e5049b540ca4a00b1c957e41c" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", + "arrow-array", + "arrow-buffer", "arrow-cast", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-data", + "arrow-schema", "flatbuffers", ] @@ -237,11 +198,11 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e501d3de4d612c90677594896ca6c0fa075665a7ff980dc4189bb531c17e19f6" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", + "arrow-array", + "arrow-buffer", "arrow-cast", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-data", + "arrow-schema", "chrono", "half", "indexmap", @@ -256,10 +217,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d2671eb3793f9410230ac3efb0e6d36307be8a2dac5fad58ac9abde8e9f01e" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "arrow-select", "half", "num", @@ -272,20 +233,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc11fa039338cebbf4e29cf709c8ac1d6a65c7540063d4a25f991ab255ca85c8" dependencies = [ "ahash", - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", - 
"arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", "hashbrown 0.13.2", ] -[[package]] -name = "arrow-schema" -version = "34.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64951898473bfb8e22293e83a44f02874d2257514d49cd95f9aa4afcff183fbc" - [[package]] name = "arrow-schema" version = "36.0.0" @@ -298,10 +253,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "163e35de698098ff5f5f672ada9dc1f82533f10407c7a11e2cd09f3bcf31d18a" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num", ] @@ -311,10 +266,10 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdfbed1b10209f0dc68e6aa4c43dc76079af65880965c7c3b73f641f23d4aba" dependencies = [ - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", - "arrow-data 36.0.0", - "arrow-schema 36.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "arrow-select", "regex", "regex-syntax", @@ -346,7 +301,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -621,9 +576,9 @@ checksum = "13418e745008f7349ec7e449155f419a61b92b58a99cc3616942b926825ec76b" [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" @@ -704,7 +659,7 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -721,7 +676,7 @@ checksum = 
"2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -806,7 +761,7 @@ name = "datafusion-common" version = "21.1.0" dependencies = [ "arrow", - "arrow-array 36.0.0", + "arrow-array", "chrono", "num_cpus", "object_store", @@ -900,7 +855,7 @@ dependencies = [ name = "datafusion-sql" version = "21.1.0" dependencies = [ - "arrow-schema 36.0.0", + "arrow-schema", "datafusion-common", "datafusion-expr", "log", @@ -1147,7 +1102,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -1360,9 +1315,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.55" +version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "716f12fbcfac6ffab0a5e9ec51d0a0ff70503742bb2dc7b99396394c9dc323f0" +checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1539,9 +1494,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.140" +version = "0.2.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" +checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" [[package]] name = "libm" @@ -1551,9 +1506,9 @@ checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" [[package]] name = "libmimalloc-sys" -version = "0.1.30" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8c7cbf8b89019683667e347572e6d55a7df7ea36b0c4ce69961b0cde67b174" +checksum = "43a558e3d911bc3c7bfc8c78bc580b404d6e51c1cefbf656e176a94b49b0df40" dependencies = [ "cc", "libc", @@ -1641,9 +1596,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 
[[package]] name = "mimalloc" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcb174b18635f7561a0c6c9fc2ce57218ac7523cf72c50af80e2d79ab8f3ba1" +checksum = "3d88dad3f985ec267a3fcb7a1726f5cb1a7e8cad8b646e70a84f967210df23da" dependencies = [ "libmimalloc-sys", ] @@ -1785,9 +1740,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.5.5" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1ea8f683b4f89a64181393742c041520a1a87e9775e6b4c0dd5a3281af05fc6" +checksum = "ec9cd6ca25e796a49fa242876d1c4de36a24a6da5258e9f0bc062dbf5e81c53b" dependencies = [ "async-trait", "base64", @@ -1862,12 +1817,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "321a15f8332645759f29875b07f8233d16ed8ec1b3582223de81625a9f8506b7" dependencies = [ "ahash", - "arrow-array 36.0.0", - "arrow-buffer 36.0.0", + "arrow-array", + "arrow-buffer", "arrow-cast", - "arrow-data 36.0.0", + "arrow-data", "arrow-ipc", - "arrow-schema 36.0.0", + "arrow-schema", "arrow-select", "base64", "brotli", @@ -2013,18 +1968,18 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.54" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e472a104799c74b514a57226160104aa483546de37e839ec50e3c2e41dd87534" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] [[package]] name = "quick-xml" -version = "0.27.1" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffc053f057dd768a56f62cd7e434c42c831d296968997e9ac1f76ea7c2d14c41" +checksum = "e5c1a97b1bc42b1d550bfb48d4262153fe400a12bab1511821736f7eac76d7e2" dependencies = [ "memchr", "serde", @@ -2192,9 +2147,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.5" +version = "0.37.7" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e78cc525325c06b4a7ff02db283472f3c042b7ff0c391f96c6d5ac6f4f91b75" +checksum = "2aae838e49b3d63e9274e1c01833cc8139d3fec468c3b84688c628f44b1ae11d" dependencies = [ "bitflags", "errno", @@ -2320,7 +2275,7 @@ checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -2499,9 +2454,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.12" +version = "2.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79d9531f94112cfc3e4c8f5f02cb2b58f72c97b7efd85f70203cc6d8efda5927" +checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec" dependencies = [ "proc-macro2", "quote", @@ -2553,7 +2508,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -2617,7 +2572,7 @@ checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.13", ] [[package]] @@ -2954,17 +2909,26 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2649ff315bee4c98757f15dac226efe3d81927adbb6e882084bb1ee3e0c330a7" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets 0.47.0", + "windows-targets 0.48.0", ] [[package]] name = "windows-sys" version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ "windows_aarch64_gnullvm 0.42.2", @@ -2978,17 +2942,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f8996d3f43b4b2d44327cd71b7b0efd1284ab60e6e9d0e8b630e18555d87d3e" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" dependencies = [ - "windows_aarch64_gnullvm 0.47.0", - "windows_aarch64_msvc 0.47.0", - "windows_i686_gnu 0.47.0", - "windows_i686_msvc 0.47.0", - "windows_x86_64_gnu 0.47.0", - "windows_x86_64_gnullvm 0.47.0", - "windows_x86_64_msvc 0.47.0", + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] [[package]] @@ -2999,9 +2963,9 @@ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "831d567d53d4f3cb1db332b68e6e2b6260228eb4d99a777d8b2e8ed794027c90" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" @@ -3011,9 +2975,9 @@ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" [[package]] name = "windows_aarch64_msvc" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a42d54a417c60ce4f0e31661eed628f0fa5aca73448c093ec4d45fab4c51cdf" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" @@ -3023,9 +2987,9 @@ checksum = 
"c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" [[package]] name = "windows_i686_gnu" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1925beafdbb22201a53a483db861a5644123157c1c3cee83323a2ed565d71e3" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" @@ -3035,9 +2999,9 @@ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" [[package]] name = "windows_i686_msvc" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a8ef8f2f1711b223947d9b69b596cf5a4e452c930fb58b6fc3fdae7d0ec6b31" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" @@ -3047,9 +3011,9 @@ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" [[package]] name = "windows_x86_64_gnu" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acaa0c2cf0d2ef99b61c308a0c3dbae430a51b7345dedec470bd8f53f5a3642" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" @@ -3059,9 +3023,9 @@ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" [[package]] name = "windows_x86_64_gnullvm" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a0628f71be1d11e17ca4a0e9e15b3a5180f6fbf1c2d55e3ba3f850378052c1" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" @@ -3071,9 +3035,9 @@ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" [[package]] name = "windows_x86_64_msvc" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "9d6e62c256dc6d40b8c8707df17df8d774e60e39db723675241e7c15e910bce7" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "winreg" From 9ef7c19437034010bd247ae2cb8558d05c2573c6 Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Tue, 4 Apr 2023 15:23:20 +0300 Subject: [PATCH 41/55] Refactoring tests You can add a millisecond array as well, but I used Nano. --- .../joins/symmetric_hash_join.rs | 122 +++++++++--------- .../physical-expr/src/intervals/test_utils.rs | 8 +- 2 files changed, 62 insertions(+), 68 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index b868a63c0429..51bc24b96d4f 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1427,7 +1427,7 @@ impl SymmetricHashJoinStream { mod tests { use std::fs::File; - use arrow::array::{ArrayRef, IntervalDayTimeArray, TimestampMillisecondArray}; + use arrow::array::{ArrayRef, IntervalDayTimeArray}; use arrow::array::{Int32Array, TimestampNanosecondArray}; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; @@ -1692,10 +1692,10 @@ mod tests { Operator::Minus, Operator::Minus, Operator::Minus, - 5, - 4, - 3, - 2, + 5000000, + 4000000, + 3000000, + 2000000, schema, ), _ => unreachable!(), @@ -1747,9 +1747,13 @@ mod tests { let time = Arc::new(TimestampNanosecondArray::from( initial_range + .clone() .map(|x| 1664264591000000000 + (5000000000 * (x as i64))) .collect::>(), )); + let interval_time: ArrayRef = Arc::new(IntervalDayTimeArray::from( + initial_range.map(|x| x as i64 * 15).collect::>(), + )); let left = RecordBatch::try_from_iter(vec![ ("la1", ordered.clone()), @@ -1761,6 +1765,7 @@ mod tests { ("l_asc_null_first", ordered_asc_null_first.clone()), ("l_asc_null_last", 
ordered_asc_null_last.clone()), ("l_desc_null_first", ordered_desc_null_first.clone()), + ("li1", interval_time.clone()), ])?; let right = RecordBatch::try_from_iter(vec![ ("ra1", ordered.clone()), @@ -1772,41 +1777,7 @@ mod tests { ("r_asc_null_first", ordered_asc_null_first), ("r_asc_null_last", ordered_asc_null_last), ("r_desc_null_first", ordered_desc_null_first), - ])?; - Ok((left, right)) - } - - fn build_sides_record_batches_temporal( - table_size: i32, - key_cardinality: i32, - ) -> Result<(RecordBatch, RecordBatch)> { - let initial_range = 0..table_size; - let ordered: ArrayRef = Arc::new(TimestampMillisecondArray::from( - initial_range - .clone() - .map(|x| x as i64 + 1672531200000) - .collect::>(), - )); - let ordered2: ArrayRef = Arc::new(IntervalDayTimeArray::from( - initial_range - .clone() - .map(|x| x as i64 * 15) - .collect::>(), - )); - let cardinality_key = Arc::new(Int32Array::from_iter( - initial_range - .map(|x| x % key_cardinality) - .collect::>(), - )); - let left = RecordBatch::try_from_iter(vec![ - ("la1", ordered.clone()), - ("la2", ordered2.clone()), - ("lc1", cardinality_key.clone()), - ])?; - let right = RecordBatch::try_from_iter(vec![ - ("ra1", ordered), - ("ra2", ordered2), - ("rc1", cardinality_key), + ("ri1", interval_time), ])?; Ok((left, right)) } @@ -2551,29 +2522,26 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn with_temporal_columns() -> Result<()> { - let cardinality = 10; let join_type = JoinType::Full; let config = SessionConfig::new().with_repartition_joins(false); let session_ctx = SessionContext::with_config(config); let task_ctx = session_ctx.task_ctx(); - let (left_batch, right_batch) = - build_sides_record_batches_temporal(TABLE_SIZE, cardinality)?; + let (left_batch, right_batch) = build_sides_record_batches(TABLE_SIZE, (10, 10))?; let left_schema = &left_batch.schema(); let right_schema = &right_batch.schema(); let on = vec![( Column::new_with_schema("lc1", left_schema)?, 
Column::new_with_schema("rc1", right_schema)?, )]; - // test for timestamp - interval let left_sorted = vec![PhysicalSortExpr { - expr: col("la1", left_schema)?, + expr: col("lt1", left_schema)?, options: SortOptions { descending: false, nulls_first: true, }, }]; let right_sorted = vec![PhysicalSortExpr { - expr: col("ra1", right_schema)?, + expr: col("rt1", right_schema)?, options: SortOptions { descending: false, nulls_first: true, @@ -2589,12 +2557,12 @@ mod tests { let intermediate_schema = Schema::new(vec![ Field::new( "left", - DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), false, ), Field::new( "right", - DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), false, ), ]); @@ -2606,27 +2574,40 @@ mod tests { ); let column_indices = vec![ ColumnIndex { - index: 0, + index: 3, side: JoinSide::Left, }, ColumnIndex { - index: 0, + index: 3, side: JoinSide::Right, }, ]; let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); experiment(left, right, filter, join_type, on.clone(), task_ctx.clone()).await?; - - // test for timestamp - timestamp + Ok(()) + } + #[tokio::test(flavor = "multi_thread")] + async fn with_temporal_columns_2() -> Result<()> { + let join_type = JoinType::Full; + let config = SessionConfig::new().with_repartition_joins(false); + let session_ctx = SessionContext::with_config(config); + let task_ctx = session_ctx.task_ctx(); + let (left_batch, right_batch) = build_sides_record_batches(TABLE_SIZE, (10, 10))?; + let left_schema = &left_batch.schema(); + let right_schema = &right_batch.schema(); + let on = vec![( + Column::new_with_schema("lc1", left_schema)?, + Column::new_with_schema("rc1", right_schema)?, + )]; let left_sorted = vec![PhysicalSortExpr { - expr: col("la1", left_schema)?, + expr: col("lt1", left_schema)?, options: SortOptions { descending: false, nulls_first: true, }, }]; let right_sorted = 
vec![PhysicalSortExpr { - expr: col("ra1", right_schema)?, + expr: col("rt1", right_schema)?, options: SortOptions { descending: false, nulls_first: true, @@ -2642,12 +2623,12 @@ mod tests { let intermediate_schema = Schema::new(vec![ Field::new( "left", - DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), false, ), Field::new( "right", - DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), false, ), ]); @@ -2659,27 +2640,40 @@ mod tests { ); let column_indices = vec![ ColumnIndex { - index: 0, + index: 3, side: JoinSide::Left, }, ColumnIndex { - index: 0, + index: 3, side: JoinSide::Right, }, ]; let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); experiment(left, right, filter, join_type, on.clone(), task_ctx.clone()).await?; - - // test for interval - interval + Ok(()) + } + #[tokio::test(flavor = "multi_thread")] + async fn with_temporal_columns_3() -> Result<()> { + let join_type = JoinType::Full; + let config = SessionConfig::new().with_repartition_joins(false); + let session_ctx = SessionContext::with_config(config); + let task_ctx = session_ctx.task_ctx(); + let (left_batch, right_batch) = build_sides_record_batches(TABLE_SIZE, (10, 10))?; + let left_schema = &left_batch.schema(); + let right_schema = &right_batch.schema(); + let on = vec![( + Column::new_with_schema("lc1", left_schema)?, + Column::new_with_schema("rc1", right_schema)?, + )]; let left_sorted = vec![PhysicalSortExpr { - expr: col("la2", left_schema)?, + expr: col("li1", left_schema)?, options: SortOptions { descending: false, nulls_first: true, }, }]; let right_sorted = vec![PhysicalSortExpr { - expr: col("ra2", right_schema)?, + expr: col("ri1", right_schema)?, options: SortOptions { descending: false, nulls_first: true, @@ -2699,11 +2693,11 @@ mod tests { ); let column_indices = vec![ ColumnIndex { - index: 1, + index: 9, side: JoinSide::Left, }, ColumnIndex { - 
index: 1, + index: 9, side: JoinSide::Right, }, ]; diff --git a/datafusion/physical-expr/src/intervals/test_utils.rs b/datafusion/physical-expr/src/intervals/test_utils.rs index 7929b05b7d14..4e40b414bc8f 100644 --- a/datafusion/physical-expr/src/intervals/test_utils.rs +++ b/datafusion/physical-expr/src/intervals/test_utils.rs @@ -146,7 +146,7 @@ pub fn gen_conjunctive_timestamp_expr( DateTimeIntervalExpr::try_new( left_col.clone(), op_1, - Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Arc::new(Literal::new(ScalarValue::TimestampNanosecond( Some(a.into()), None, ))), @@ -158,7 +158,7 @@ pub fn gen_conjunctive_timestamp_expr( DateTimeIntervalExpr::try_new( right_col.clone(), op_2, - Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Arc::new(Literal::new(ScalarValue::TimestampNanosecond( Some(b.into()), None, ))), @@ -170,7 +170,7 @@ pub fn gen_conjunctive_timestamp_expr( DateTimeIntervalExpr::try_new( left_col, op_3, - Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Arc::new(Literal::new(ScalarValue::TimestampNanosecond( Some(c.into()), None, ))), @@ -182,7 +182,7 @@ pub fn gen_conjunctive_timestamp_expr( DateTimeIntervalExpr::try_new( right_col, op_4, - Arc::new(Literal::new(ScalarValue::TimestampMillisecond( + Arc::new(Literal::new(ScalarValue::TimestampNanosecond( Some(d.into()), None, ))), From 7fd0c10671af75970df64c3f720bfbde9c1a8216 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 4 Apr 2023 16:13:32 +0300 Subject: [PATCH 42/55] bug detected --- datafusion/common/src/scalar.rs | 48 ++------ .../joins/symmetric_hash_join.rs | 45 ++++---- .../physical-expr/src/expressions/binary.rs | 3 + .../physical-expr/src/intervals/test_utils.rs | 104 ++---------------- 4 files changed, 51 insertions(+), 149 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index b99cebd7d280..9cf795d57690 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -1121,12 +1121,8 @@ 
pub fn seconds_add_array( #[inline] pub fn milliseconds_add(ts_ms: i64, scalar: &ScalarValue, sign: i32) -> Result { - let mut secs = ts_ms / 1000; - let mut nsecs = ((ts_ms % 1000) * 1_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } + let secs = ts_ms.div_euclid(1000); + let nsecs = ts_ms.rem_euclid(1000) * 1_000_000; do_date_time_math(secs, nsecs as u32, scalar, sign).map(|dt| dt.timestamp_millis()) } @@ -1136,24 +1132,16 @@ pub fn milliseconds_add_array( interval: i128, sign: i32, ) -> Result { - let mut secs = ts_ms / 1000; - let mut nsecs = ((ts_ms % 1000) * 1_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } + let secs = ts_ms.div_euclid(1000); + let nsecs = ts_ms.rem_euclid(1000) * 1_000_000; do_date_time_math_array::(secs, nsecs as u32, interval, sign) .map(|dt| dt.timestamp_millis()) } #[inline] pub fn microseconds_add(ts_us: i64, scalar: &ScalarValue, sign: i32) -> Result { - let mut secs = ts_us / 1_000_000; - let mut nsecs = ((ts_us % 1_000_000) * 1000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } + let secs = ts_us.div_euclid(1_000_000); + let nsecs = ts_us.rem_euclid(1_000_000) * 1_000; do_date_time_math(secs, nsecs as u32, scalar, sign) .map(|dt| dt.timestamp_nanos() / 1000) } @@ -1164,24 +1152,16 @@ pub fn microseconds_add_array( interval: i128, sign: i32, ) -> Result { - let mut secs = ts_us / 1_000_000; - let mut nsecs = ((ts_us % 1_000_000) * 1000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } + let secs = ts_us.div_euclid(1_000_000); + let nsecs = ts_us.rem_euclid(1_000_000) * 1_000; do_date_time_math_array::(secs, nsecs as u32, interval, sign) .map(|dt| dt.timestamp_nanos() / 1000) } #[inline] pub fn nanoseconds_add(ts_ns: i64, scalar: &ScalarValue, sign: i32) -> Result { - let mut secs = ts_ns / 1_000_000_000; - let mut nsecs = (ts_ns % 1_000_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } + let secs = 
ts_ns.div_euclid(1_000_000_000); + let nsecs = ts_ns.rem_euclid(1_000_000_000); do_date_time_math(secs, nsecs as u32, scalar, sign).map(|dt| dt.timestamp_nanos()) } @@ -1191,12 +1171,8 @@ pub fn nanoseconds_add_array( interval: i128, sign: i32, ) -> Result { - let mut secs = ts_ns / 1_000_000_000; - let mut nsecs = (ts_ns % 1_000_000_000) as i32; - if nsecs < 0 { - secs -= 1; - nsecs += 1_000_000_000; - } + let secs = ts_ns.div_euclid(1_000_000_000); + let nsecs = ts_ns.rem_euclid(1_000_000_000); do_date_time_math_array::(secs, nsecs as u32, interval, sign) .map(|dt| dt.timestamp_nanos()) } diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index 51bc24b96d4f..4601369a902b 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1428,18 +1428,17 @@ mod tests { use std::fs::File; use arrow::array::{ArrayRef, IntervalDayTimeArray}; - use arrow::array::{Int32Array, TimestampNanosecondArray}; + use arrow::array::{Int32Array, TimestampMillisecondArray}; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; - use arrow::util::pretty::pretty_format_batches; + use arrow::util::pretty::{pretty_format_batches, print_batches}; use rstest::*; use tempfile::TempDir; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{binary, col, Column}; use datafusion_physical_expr::intervals::test_utils::{ - gen_conjunctive_interval_expr, gen_conjunctive_numeric_expr, - gen_conjunctive_timestamp_expr, + gen_conjunctive_numeric_expr, gen_conjunctive_temporal_expr, }; use datafusion_physical_expr::PhysicalExpr; @@ -1672,30 +1671,30 @@ mod tests { schema: &Schema, ) -> Arc { match expr_id { - 0 => gen_conjunctive_interval_expr( + 0 => gen_conjunctive_temporal_expr( left_col, right_col, Operator::Minus, Operator::Minus, Operator::Minus, 
Operator::Minus, - 5, - 4, - 3, - 2, + ScalarValue::IntervalDayTime(Some(100)), // 100 ms + ScalarValue::IntervalDayTime(Some(200)), // 200 ms + ScalarValue::IntervalDayTime(Some(450)), // 450 ms + ScalarValue::IntervalDayTime(Some(300)), // 300 ms schema, ), - 1 => gen_conjunctive_timestamp_expr( + 1 => gen_conjunctive_temporal_expr( left_col, right_col, Operator::Minus, Operator::Minus, Operator::Minus, Operator::Minus, - 5000000, - 4000000, - 3000000, - 2000000, + ScalarValue::TimestampMillisecond(Some(1672574403000), None), // 2023-01-01:12.00.03 + ScalarValue::TimestampMillisecond(Some(1672574401000), None), // 2023-01-01:12.00.01 + ScalarValue::TimestampMillisecond(Some(1672574400000), None), // 2023-01-01:12.00.00 + ScalarValue::TimestampMillisecond(Some(1672574402000), None), // 2023-01-01:12.00.02 schema, ), _ => unreachable!(), @@ -1745,14 +1744,16 @@ mod tests { .collect::>>() })); - let time = Arc::new(TimestampNanosecondArray::from( + let time = Arc::new(TimestampMillisecondArray::from( initial_range .clone() - .map(|x| 1664264591000000000 + (5000000000 * (x as i64))) + .map(|x| x as i64 + 1672531200000) // x + 2023-01-01:00.00.00 .collect::>(), )); let interval_time: ArrayRef = Arc::new(IntervalDayTimeArray::from( - initial_range.map(|x| x as i64 * 15).collect::>(), + initial_range + .map(|x| x as i64 * 100) // x * 100ms + .collect::>(), )); let left = RecordBatch::try_from_iter(vec![ @@ -1831,6 +1832,8 @@ mod tests { left, right, on, filter, &join_type, false, task_ctx, ) .await?; + print_batches(&first_batches); + print_batches(&second_batches); compare_batches(&first_batches, &second_batches); Ok(()) } @@ -2557,12 +2560,12 @@ mod tests { let intermediate_schema = Schema::new(vec![ Field::new( "left", - DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), false, ), Field::new( "right", - DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), false, ), ]); 
@@ -2623,12 +2626,12 @@ mod tests { let intermediate_schema = Schema::new(vec![ Field::new( "left", - DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), false, ), Field::new( "right", - DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), false, ), ]); diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index d4ca84699af7..17c37b2eb3b8 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1396,6 +1396,9 @@ pub fn ts_scalar_interval_op( }; Ok(ColumnarValue::Array(ret)) } +/// This function handles the Interval - Interval operations, +/// where the first one is an array, and the second one is a scalar, +/// hence the result is also an interval array. pub fn interval_scalar_interval_op( array: ArrayRef, sign: i32, diff --git a/datafusion/physical-expr/src/intervals/test_utils.rs b/datafusion/physical-expr/src/intervals/test_utils.rs index 4e40b414bc8f..7219b474efac 100644 --- a/datafusion/physical-expr/src/intervals/test_utils.rs +++ b/datafusion/physical-expr/src/intervals/test_utils.rs @@ -69,87 +69,26 @@ pub fn gen_conjunctive_numeric_expr( #[allow(clippy::too_many_arguments)] /// This test function generates a conjunctive statement with -/// two timestamp terms with the following form: +/// two scalar values with the following form: /// left_col (op_1) a > right_col (op_2) b AND left_col (op_3) c < right_col (op_4) d -pub fn gen_conjunctive_interval_expr( +pub fn gen_conjunctive_temporal_expr( left_col: Arc, right_col: Arc, op_1: Operator, op_2: Operator, op_3: Operator, op_4: Operator, - a: i32, - b: i32, - c: i32, - d: i32, - schema: &Schema, -) -> Arc { - let left_and_1 = Arc::new( - DateTimeIntervalExpr::try_new( - left_col.clone(), - op_1, - Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(a.into())))), - schema, - ) - 
.unwrap(), - ); - let left_and_2 = Arc::new( - DateTimeIntervalExpr::try_new( - right_col.clone(), - op_2, - Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(b.into())))), - schema, - ) - .unwrap(), - ); - let right_and_1 = Arc::new( - DateTimeIntervalExpr::try_new( - left_col, - op_3, - Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(c.into())))), - schema, - ) - .unwrap(), - ); - let right_and_2 = Arc::new( - DateTimeIntervalExpr::try_new( - right_col, - op_4, - Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(d.into())))), - schema, - ) - .unwrap(), - ); - let left_expr = Arc::new(BinaryExpr::new(left_and_1, Operator::Gt, left_and_2)); - let right_expr = Arc::new(BinaryExpr::new(right_and_1, Operator::Lt, right_and_2)); - Arc::new(BinaryExpr::new(left_expr, Operator::And, right_expr)) -} - -#[allow(clippy::too_many_arguments)] -/// This test function generates a conjunctive statement with -/// one timestamp and one interval term with the following form: -/// left_col (op_1) a > right_col (op_2) b AND left_col (op_3) c < right_col (op_4) d -pub fn gen_conjunctive_timestamp_expr( - left_col: Arc, - right_col: Arc, - op_1: Operator, - op_2: Operator, - op_3: Operator, - op_4: Operator, - a: i32, - b: i32, - c: i32, - d: i32, + a: ScalarValue, + b: ScalarValue, + c: ScalarValue, + d: ScalarValue, schema: &Schema, ) -> Arc { let left_and_1 = Arc::new( DateTimeIntervalExpr::try_new( left_col.clone(), op_1, - Arc::new(Literal::new(ScalarValue::TimestampNanosecond( - Some(a.into()), - None, - ))), + Arc::new(Literal::new(a)), schema, ) .unwrap(), @@ -158,37 +97,18 @@ pub fn gen_conjunctive_timestamp_expr( DateTimeIntervalExpr::try_new( right_col.clone(), op_2, - Arc::new(Literal::new(ScalarValue::TimestampNanosecond( - Some(b.into()), - None, - ))), + Arc::new(Literal::new(b)), schema, ) .unwrap(), ); let right_and_1 = Arc::new( - DateTimeIntervalExpr::try_new( - left_col, - op_3, - Arc::new(Literal::new(ScalarValue::TimestampNanosecond( - 
Some(c.into()), - None, - ))), - schema, - ) - .unwrap(), + DateTimeIntervalExpr::try_new(left_col, op_3, Arc::new(Literal::new(c)), schema) + .unwrap(), ); let right_and_2 = Arc::new( - DateTimeIntervalExpr::try_new( - right_col, - op_4, - Arc::new(Literal::new(ScalarValue::TimestampNanosecond( - Some(d.into()), - None, - ))), - schema, - ) - .unwrap(), + DateTimeIntervalExpr::try_new(right_col, op_4, Arc::new(Literal::new(d)), schema) + .unwrap(), ); let left_expr = Arc::new(BinaryExpr::new(left_and_1, Operator::Gt, left_and_2)); let right_expr = Arc::new(BinaryExpr::new(right_and_1, Operator::Lt, right_and_2)); From 6b0da545db970ad1b3de2175bfc921a52db6b99f Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 5 Apr 2023 14:20:49 +0300 Subject: [PATCH 43/55] bug fixed --- .../joins/symmetric_hash_join.rs | 4 +-- .../physical-expr/src/expressions/binary.rs | 4 +-- .../src/intervals/interval_aritmetic.rs | 28 +++++++++++++------ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index 4601369a902b..f7392b5420d2 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1431,7 +1431,7 @@ mod tests { use arrow::array::{Int32Array, TimestampMillisecondArray}; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; - use arrow::util::pretty::{pretty_format_batches, print_batches}; + use arrow::util::pretty::pretty_format_batches; use rstest::*; use tempfile::TempDir; @@ -1832,8 +1832,6 @@ mod tests { left, right, on, filter, &join_type, false, task_ctx, ) .await?; - print_batches(&first_batches); - print_batches(&second_batches); compare_batches(&first_batches, &second_batches); Ok(()) } diff --git a/datafusion/physical-expr/src/expressions/binary.rs 
b/datafusion/physical-expr/src/expressions/binary.rs index 17c37b2eb3b8..f376756d884b 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1447,7 +1447,7 @@ pub fn interval_scalar_interval_op( ) => { let array = as_interval_dt_array(&array)?; let ret: PrimitiveArray = - unary(array, |lhs| op_dt(*rhs, lhs, sign)); + unary(array, |lhs| op_dt(lhs, *rhs, sign)); Arc::new(ret) as ArrayRef } ( @@ -1483,7 +1483,7 @@ pub fn interval_scalar_interval_op( ) => { let array = as_interval_mdn_array(&array)?; let ret: PrimitiveArray = - unary(array, |lhs| op_mdn(*rhs, lhs, sign)); + unary(array, |lhs| op_mdn(lhs, *rhs, sign)); Arc::new(ret) as ArrayRef } _ => Err(DataFusionError::Internal(format!( diff --git a/datafusion/physical-expr/src/intervals/interval_aritmetic.rs b/datafusion/physical-expr/src/intervals/interval_aritmetic.rs index c24c96137c08..94ac4e9a81ee 100644 --- a/datafusion/physical-expr/src/intervals/interval_aritmetic.rs +++ b/datafusion/physical-expr/src/intervals/interval_aritmetic.rs @@ -202,15 +202,21 @@ impl Interval { /// one can choose single values arbitrarily from each of the operands. pub fn add>(&self, other: T) -> Result { let rhs = other.borrow(); - let datatype = - coerce_types(&self.get_datatype(), &Operator::Plus, &rhs.get_datatype())?; let lower = if self.lower.is_null() || rhs.lower.is_null() { - ScalarValue::try_from(&datatype) + ScalarValue::try_from(&coerce_types( + &self.get_datatype(), + &Operator::Plus, + &rhs.get_datatype(), + )?) } else { self.lower.add(&rhs.lower) }?; let upper = if self.upper.is_null() || rhs.upper.is_null() { - ScalarValue::try_from(&datatype) + ScalarValue::try_from(coerce_types( + &self.get_datatype(), + &Operator::Plus, + &rhs.get_datatype(), + )?) } else { self.upper.add(&rhs.upper) }?; @@ -223,15 +229,21 @@ impl Interval { /// if one can choose single values arbitrarily from each of the operands. 
pub fn sub>(&self, other: T) -> Result { let rhs = other.borrow(); - let datatype = - coerce_types(&self.get_datatype(), &Operator::Minus, &rhs.get_datatype())?; let lower = if self.lower.is_null() || rhs.upper.is_null() { - ScalarValue::try_from(&datatype) + ScalarValue::try_from(coerce_types( + &self.get_datatype(), + &Operator::Minus, + &rhs.get_datatype(), + )?) } else { self.lower.sub(&rhs.upper) }?; let upper = if self.upper.is_null() || rhs.lower.is_null() { - ScalarValue::try_from(&datatype) + ScalarValue::try_from(coerce_types( + &self.get_datatype(), + &Operator::Minus, + &rhs.get_datatype(), + )?) } else { self.upper.sub(&rhs.lower) }?; From 8bee5d9ed05926c6f4dd51d1865c82b32f8b876c Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 5 Apr 2023 14:21:21 +0300 Subject: [PATCH 44/55] update cargo --- datafusion-cli/Cargo.lock | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 273267ebb908..c901e0913e4a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -962,7 +962,7 @@ checksum = "50d6a0976c999d473fe89ad888d5a284e55366d9dc9038b1ba2aa15128c4afa0" dependencies = [ "errno-dragonfly", "libc", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -1002,7 +1002,7 @@ checksum = "9799aefb4a2e4a01cc47610b1dd47c18ab13d991f27bbcaed9296f5a53d5cbad" dependencies = [ "cfg-if", "rustix", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -1374,13 +1374,13 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "io-lifetimes" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" +checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" dependencies = [ "hermit-abi 0.3.1", "libc", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ 
-1627,7 +1627,7 @@ dependencies = [ "libc", "log", "wasi", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -1807,7 +1807,7 @@ dependencies = [ "libc", "redox_syscall 0.2.16", "smallvec", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -2156,7 +2156,7 @@ dependencies = [ "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -2473,7 +2473,7 @@ dependencies = [ "fastrand", "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -2561,7 +2561,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.45.0", ] [[package]] @@ -2925,6 +2925,15 @@ dependencies = [ "windows-targets 0.42.2", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", +] + [[package]] name = "windows-targets" version = "0.42.2" From aa0f2b555f7346cdcc51425f867d0816306fbc53 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 5 Apr 2023 16:31:42 +0300 Subject: [PATCH 45/55] tests added --- .../joins/symmetric_hash_join.rs | 8 +- .../physical-expr/src/expressions/datetime.rs | 202 ++++++++++++++++++ 2 files changed, 206 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index f7392b5420d2..ce4ccffa6e8b 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1678,10 +1678,10 @@ mod tests { Operator::Minus, Operator::Minus, Operator::Minus, - ScalarValue::IntervalDayTime(Some(100)), // 100 ms - ScalarValue::IntervalDayTime(Some(200)), // 200 ms - ScalarValue::IntervalDayTime(Some(450)), // 450 ms - ScalarValue::IntervalDayTime(Some(300)), // 300 ms + 
ScalarValue::new_interval_dt(0, 100), // 100 ms + ScalarValue::new_interval_dt(0, 200), // 200 ms + ScalarValue::new_interval_dt(0, 450), // 450 ms + ScalarValue::new_interval_dt(0, 300), // 300 ms schema, ), 1 => gen_conjunctive_temporal_expr( diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index 4cb05ce043ba..e5d461e740f8 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -760,6 +760,10 @@ mod tests { use crate::execution_props::ExecutionProps; use arrow::array::{ArrayRef, Date32Builder}; use arrow::datatypes::*; + use arrow_array::{ + IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, + }; use chrono::{Duration, NaiveDate}; use datafusion_common::delta::shift_months; use datafusion_common::{Column, Result, ToDFSchema}; @@ -1154,4 +1158,202 @@ mod tests { let res = cut.evaluate(&batch)?; Ok(res) } + + // In this test, ArrayRef of one element arrays is evaluated with some ScalarValues, + // aiming that evaluate_temporal_array function is working properly and shows the same + // behavior with ScalarValue arithmetic. + fn experiment( + timestamp_array: ArrayRef, + timestamp_scalar: ScalarValue, + interval_array: ArrayRef, + interval_scalar: ScalarValue, + ) -> Result<()> { + // timestamp + interval + if let ColumnarValue::Array(res1) = + evaluate_temporal_array(timestamp_array.clone(), 1, &interval_scalar)? + { + let res2 = timestamp_scalar.add(&interval_scalar)?.to_array(); + assert_eq!( + &res1, &res2, + "Timestamp Scalar={} + Interval Scalar={}", + timestamp_scalar, interval_scalar + ); + } + + // timestamp - interval + if let ColumnarValue::Array(res1) = + evaluate_temporal_array(timestamp_array.clone(), -1, &interval_scalar)? 
+ { + let res2 = timestamp_scalar.sub(&interval_scalar)?.to_array(); + assert_eq!( + &res1, &res2, + "Timestamp Scalar={} - Interval Scalar={}", + timestamp_scalar, interval_scalar + ); + } + + // timestamp - timestamp + if let ColumnarValue::Array(res1) = + evaluate_temporal_array(timestamp_array.clone(), -1, &timestamp_scalar)? + { + let res2 = timestamp_scalar.sub(&timestamp_scalar)?.to_array(); + assert_eq!( + &res1, &res2, + "Timestamp Scalar={} - Timestamp Scalar={}", + timestamp_scalar, timestamp_scalar + ); + } + + // interval - interval + if let ColumnarValue::Array(res1) = + evaluate_temporal_array(interval_array.clone(), -1, &interval_scalar)? + { + let res2 = interval_scalar.sub(&interval_scalar)?.to_array(); + assert_eq!( + &res1, &res2, + "Interval Scalar={} - Interval Scalar={}", + interval_scalar, interval_scalar + ); + } + + // interval + interval + if let ColumnarValue::Array(res1) = + evaluate_temporal_array(interval_array, 1, &interval_scalar)? + { + let res2 = interval_scalar.add(&interval_scalar)?.to_array(); + assert_eq!( + &res1, &res2, + "Interval Scalar={} + Interval Scalar={}", + interval_scalar, interval_scalar + ); + } + + Ok(()) + } + #[test] + fn test_evalute_with_scalar() -> Result<()> { + // Timestamp (sec) & Interval (DayTime) + let timestamp_array = Arc::new(TimestampSecondArray::from(vec![ + 1_672_531_200, // 2023-01-01:00.00.00, + ])); + let timestamp_scalar = ScalarValue::TimestampSecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, + ); + let interval_array: ArrayRef = Arc::new(IntervalDayTimeArray::from(vec![1_000])); // 1 sec + let interval_scalar = ScalarValue::new_interval_dt(0, 1_000); + + experiment( + timestamp_array, + timestamp_scalar, + interval_array, + interval_scalar, + )?; + + // Timestamp (millisec) & Interval (DayTime) + let timestamp_array = Arc::new(TimestampMillisecondArray::from(vec![ + 1_672_531_200_000, // 2023-01-01:00.00.00, + ])); +
let timestamp_scalar = ScalarValue::TimestampMillisecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_milli_opt(0, 0, 0, 0) + .unwrap() + .timestamp_millis(), + ), + None, + ); + let interval_array: ArrayRef = Arc::new(IntervalDayTimeArray::from(vec![1_000])); // 1 sec + let interval_scalar = ScalarValue::new_interval_dt(0, 1_000); + + experiment( + timestamp_array, + timestamp_scalar, + interval_array, + interval_scalar, + )?; + + // Timestamp (nanosec) & Interval (MonthDayNano) + let timestamp_array = Arc::new(TimestampNanosecondArray::from(vec![ + 1_672_531_200_000_000_000, // 2023-01-01:00.00.00, + ])); + let timestamp_scalar = ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 0) + .unwrap() + .timestamp_nanos(), + ), + None, + ); + let interval_array: ArrayRef = + Arc::new(IntervalMonthDayNanoArray::from(vec![1_000])); // 1 us + let interval_scalar = ScalarValue::new_interval_mdn(0, 0, 1_000); + + experiment( + timestamp_array, + timestamp_scalar, + interval_array, + interval_scalar, + )?; + + // Timestamp (nanosec) & Interval (MonthDayNano), negatively resulting cases + let timestamp_array = Arc::new(TimestampNanosecondArray::from(vec![ + 0, // 2023-01-01:00.00.00, + ])); + let timestamp_scalar = ScalarValue::TimestampNanosecond( + Some( + NaiveDate::from_ymd_opt(1970, 1, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 000) + .unwrap() + .timestamp_nanos(), + ), + None, + ); + let interval_array: ArrayRef = + Arc::new(IntervalMonthDayNanoArray::from(vec![1_000])); // 1 us + let interval_scalar = ScalarValue::new_interval_mdn(0, 0, 1_000); + + experiment( + timestamp_array, + timestamp_scalar, + interval_array, + interval_scalar, + )?; + + // Timestamp (sec) & Interval (YearMonth) + let timestamp_array = Arc::new(TimestampSecondArray::from(vec![ + 1_672_531_200, // 2023-01-01:00.00.00, + ])); + let timestamp_scalar = ScalarValue::TimestampSecond( + Some( + 
NaiveDate::from_ymd_opt(2023, 1, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap() + .timestamp(), + ), + None, + ); + let interval_array: ArrayRef = Arc::new(IntervalYearMonthArray::from(vec![1])); // 1 us + let interval_scalar = ScalarValue::new_interval_ym(0, 1); + + experiment( + timestamp_array, + timestamp_scalar, + interval_array, + interval_scalar, + )?; + + Ok(()) + } } From 6f9b70fd954eed4b8c602835fd147b41e2bad48a Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 6 Apr 2023 10:59:13 +0300 Subject: [PATCH 46/55] minor changes after merge --- datafusion-cli/Cargo.lock | 10 ++--- .../joins/symmetric_hash_join.rs | 39 ++++++++++++++----- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index eae956c5da34..55f37b1d15d8 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -3082,7 +3082,7 @@ version = "0.12.3+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" dependencies = [ - "zstd-safe 6.0.4+zstd.1.5.4", + "zstd-safe 6.0.5+zstd.1.5.4", ] [[package]] @@ -3097,9 +3097,9 @@ dependencies = [ [[package]] name = "zstd-safe" -version = "6.0.4+zstd.1.5.4" +version = "6.0.5+zstd.1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7afb4b54b8910cf5447638cb54bf4e8a65cbedd783af98b98c62ffe91f185543" +checksum = "d56d9e60b4b1758206c238a10165fbcae3ca37b01744e394c463463f6529d23b" dependencies = [ "libc", "zstd-sys", @@ -3107,9 +3107,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.7+zstd.1.5.4" +version = "2.0.8+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" dependencies = [ "cc", "libc", diff --git 
a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index c815bd2b1db7..43a6c96a90e7 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -2763,8 +2763,8 @@ mod tests { let (left, right) = create_memory_table( left_batch.clone(), right_batch.clone(), - left_sorted, - right_sorted, + Some(left_sorted), + Some(right_sorted), 13, )?; let intermediate_schema = Schema::new(vec![ @@ -2796,7 +2796,15 @@ mod tests { }, ]; let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); - experiment(left, right, filter, join_type, on.clone(), task_ctx.clone()).await?; + experiment( + left, + right, + Some(filter), + join_type, + on.clone(), + task_ctx.clone(), + ) + .await?; Ok(()) } #[tokio::test(flavor = "multi_thread")] @@ -2829,8 +2837,8 @@ mod tests { let (left, right) = create_memory_table( left_batch.clone(), right_batch.clone(), - left_sorted, - right_sorted, + Some(left_sorted), + Some(right_sorted), 13, )?; let intermediate_schema = Schema::new(vec![ @@ -2862,7 +2870,15 @@ mod tests { }, ]; let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); - experiment(left, right, filter, join_type, on.clone(), task_ctx.clone()).await?; + experiment( + left, + right, + Some(filter), + join_type, + on.clone(), + task_ctx.clone(), + ) + .await?; Ok(()) } #[tokio::test(flavor = "multi_thread")] @@ -2892,8 +2908,13 @@ mod tests { nulls_first: true, }, }]; - let (left, right) = - create_memory_table(left_batch, right_batch, left_sorted, right_sorted, 13)?; + let (left, right) = create_memory_table( + left_batch, + right_batch, + Some(left_sorted), + Some(right_sorted), + 13, + )?; let intermediate_schema = Schema::new(vec![ Field::new("left", DataType::Interval(IntervalUnit::DayTime), false), Field::new("right", DataType::Interval(IntervalUnit::DayTime), false), @@ 
-2915,7 +2936,7 @@ mod tests { }, ]; let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); - experiment(left, right, filter, join_type, on, task_ctx).await?; + experiment(left, right, Some(filter), join_type, on, task_ctx).await?; Ok(()) } From 85bd570de978d89ee1b39716694949dcf70a762f Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Fri, 7 Apr 2023 11:35:06 +0300 Subject: [PATCH 47/55] fix after merge --- datafusion/physical-expr/src/expressions/binary.rs | 4 ++-- datafusion/physical-expr/src/expressions/datetime.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 8b8aa41998b1..d30f219bd961 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -55,7 +55,6 @@ use arrow::datatypes::*; use adapter::{eq_dyn, gt_dyn, gt_eq_dyn, lt_dyn, lt_eq_dyn, neq_dyn}; use arrow::compute::kernels::concat_elements::concat_elements_utf8; -use chrono::NaiveDateTime; use datafusion_common::scalar::{ calculate_naives, microseconds_add, microseconds_sub, milliseconds_add, milliseconds_sub, nanoseconds_add, nanoseconds_sub, op_dt, op_dt_mdn, op_mdn, op_ym, @@ -1497,7 +1496,8 @@ pub fn interval_scalar_interval_op( )))?, }; Ok(ColumnarValue::Array(ret)) - +} + // Macros related with timestamp & interval operations macro_rules! 
ts_sub_op { ($lhs:ident, $rhs:ident, $lhs_tz:ident, $rhs_tz:ident, $coef:expr, $caster:expr, $op:expr, $ts_unit:expr, $mode:expr, $type_out:ty) => {{ diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index bbf7fd213a5c..f412c2848d79 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -20,11 +20,8 @@ use crate::intervals::{apply_operator, Interval}; use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; use arrow::array::{Array, ArrayRef}; -use arrow::compute::unary; -use arrow::datatypes::{ - DataType, Date32Type, Date64Type, Schema, TimeUnit, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, -}; +use arrow::compute::try_unary; +use arrow::datatypes::{DataType, Date32Type, Date64Type, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::cast::*; @@ -37,7 +34,10 @@ use std::any::Any; use std::fmt::{Display, Formatter}; use std::sync::Arc; -use super::binary::{interval_array_op, ts_array_op, ts_interval_array_op}; +use super::binary::{ + interval_array_op, interval_scalar_interval_op, ts_array_op, ts_interval_array_op, + ts_scalar_interval_op, ts_scalar_ts_op, +}; /// Perform DATE/TIME/TIMESTAMP +/ INTERVAL math #[derive(Debug)] From f3554772a823dd42e8afae5cd33555f1be069293 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Fri, 7 Apr 2023 14:17:40 +0300 Subject: [PATCH 48/55] code simplification --- .../joins/symmetric_hash_join.rs | 10 +-- .../physical-expr/src/expressions/datetime.rs | 70 ++++--------------- 2 files changed, 18 insertions(+), 62 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index a2a578e68d68..fa40b420a567 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ 
b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1740,6 +1740,7 @@ mod tests { schema: &Schema, ) -> Arc { match expr_id { + // constructs ((left_col - INTERVAL '100ms') > (right_col - INTERVAL '200ms')) AND ((left_col - INTERVAL '450ms') < (right_col - INTERVAL '300ms')) 0 => gen_conjunctive_temporal_expr( left_col, right_col, @@ -1753,6 +1754,7 @@ mod tests { ScalarValue::new_interval_dt(0, 300), // 300 ms schema, ), + // constructs ((left_col - TIMESTAMP '2023-01-01:12.00.03') > (right_col - TIMESTAMP '2023-01-01:12.00.01')) AND ((left_col - TIMESTAMP '2023-01-01:12.00.00') < (right_col - TIMESTAMP '2023-01-01:12.00.02')) 1 => gen_conjunctive_temporal_expr( left_col, right_col, @@ -2798,8 +2800,8 @@ mod tests { }, }]; let (left, right) = create_memory_table( - left_batch.clone(), - right_batch.clone(), + left_batch, + right_batch, Some(left_sorted), Some(right_sorted), 13, @@ -2872,8 +2874,8 @@ mod tests { }, }]; let (left, right) = create_memory_table( - left_batch.clone(), - right_batch.clone(), + left_batch, + right_batch, Some(left_sorted), Some(right_sorted), 13, diff --git a/datafusion/physical-expr/src/expressions/datetime.rs b/datafusion/physical-expr/src/expressions/datetime.rs index f412c2848d79..c2a54beceb1e 100644 --- a/datafusion/physical-expr/src/expressions/datetime.rs +++ b/datafusion/physical-expr/src/expressions/datetime.rs @@ -310,10 +310,7 @@ mod tests { use crate::execution_props::ExecutionProps; use arrow::array::{ArrayRef, Date32Builder}; use arrow::datatypes::*; - use arrow_array::{ - IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, - }; + use arrow_array::IntervalMonthDayNanoArray; use chrono::{Duration, NaiveDate}; use datafusion_common::delta::shift_months; use datafusion_common::{Column, Result, ToDFSchema}; @@ -713,11 +710,12 @@ mod tests { // aiming that evaluate_temporal_array function is working properly and 
shows the same // behavior with ScalarValue arithmetic. fn experiment( - timestamp_array: ArrayRef, timestamp_scalar: ScalarValue, - interval_array: ArrayRef, interval_scalar: ScalarValue, ) -> Result<()> { + let timestamp_array = timestamp_scalar.to_array(); + let interval_array = interval_scalar.to_array(); + // timestamp + interval if let ColumnarValue::Array(res1) = evaluate_temporal_array(timestamp_array.clone(), 1, &interval_scalar)? @@ -783,9 +781,6 @@ mod tests { #[test] fn test_evalute_with_scalar() -> Result<()> { // Timestamp (sec) & Interval (DayTime) - let timestamp_array = Arc::new(TimestampSecondArray::from(vec![ - 1_672_531_200, // 2023-01-01:00.00.00, - ])); let timestamp_scalar = ScalarValue::TimestampSecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) @@ -796,20 +791,11 @@ mod tests { ), None, ); - let interval_array: ArrayRef = Arc::new(IntervalDayTimeArray::from(vec![1_000])); // 1 sec let interval_scalar = ScalarValue::new_interval_dt(0, 1_000); - experiment( - timestamp_array, - timestamp_scalar, - interval_array, - interval_scalar, - )?; + experiment(timestamp_scalar, interval_scalar)?; // Timestamp (millisec) & Interval (DayTime) - let timestamp_array = Arc::new(TimestampMillisecondArray::from(vec![ - 1_672_531_200_000, // 2023-01-01:00.00.00, - ])); let timestamp_scalar = ScalarValue::TimestampMillisecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) @@ -820,20 +806,11 @@ mod tests { ), None, ); - let interval_array: ArrayRef = Arc::new(IntervalDayTimeArray::from(vec![1_000])); // 1 sec let interval_scalar = ScalarValue::new_interval_dt(0, 1_000); - experiment( - timestamp_array, - timestamp_scalar, - interval_array, - interval_scalar, - )?; + experiment(timestamp_scalar, interval_scalar)?; // Timestamp (nanosec) & Interval (MonthDayNano) - let timestamp_array = Arc::new(TimestampNanosecondArray::from(vec![ - 1_672_531_200_000_000_000, // 2023-01-01:00.00.00, - ])); let timestamp_scalar = ScalarValue::TimestampNanosecond( Some( 
NaiveDate::from_ymd_opt(2023, 1, 1) @@ -844,21 +821,12 @@ mod tests { ), None, ); - let interval_array: ArrayRef = - Arc::new(IntervalMonthDayNanoArray::from(vec![1_000])); // 1 us let interval_scalar = ScalarValue::new_interval_mdn(0, 0, 1_000); - experiment( - timestamp_array, - timestamp_scalar, - interval_array, - interval_scalar, - )?; + experiment(timestamp_scalar, interval_scalar)?; // Timestamp (nanosec) & Interval (MonthDayNano), negatively resulting cases - let timestamp_array = Arc::new(TimestampNanosecondArray::from(vec![ - 0, // 2023-01-01:00.00.00, - ])); + let timestamp_scalar = ScalarValue::TimestampNanosecond( Some( NaiveDate::from_ymd_opt(1970, 1, 1) @@ -869,21 +837,13 @@ mod tests { ), None, ); - let interval_array: ArrayRef = - Arc::new(IntervalMonthDayNanoArray::from(vec![1_000])); // 1 us + + Arc::new(IntervalMonthDayNanoArray::from(vec![1_000])); // 1 us let interval_scalar = ScalarValue::new_interval_mdn(0, 0, 1_000); - experiment( - timestamp_array, - timestamp_scalar, - interval_array, - interval_scalar, - )?; + experiment(timestamp_scalar, interval_scalar)?; // Timestamp (sec) & Interval (YearMonth) - let timestamp_array = Arc::new(TimestampSecondArray::from(vec![ - 1_672_531_200, // 2023-01-01:00.00.00, - ])); let timestamp_scalar = ScalarValue::TimestampSecond( Some( NaiveDate::from_ymd_opt(2023, 1, 1) @@ -894,15 +854,9 @@ mod tests { ), None, ); - let interval_array: ArrayRef = Arc::new(IntervalYearMonthArray::from(vec![1])); // 1 us let interval_scalar = ScalarValue::new_interval_ym(0, 1); - experiment( - timestamp_array, - timestamp_scalar, - interval_array, - interval_scalar, - )?; + experiment(timestamp_scalar, interval_scalar)?; Ok(()) } From a44eb23218968dd5139d73219539de5f0766b4a4 Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Fri, 7 Apr 2023 18:05:03 +0300 Subject: [PATCH 49/55] Some simplifications --- .../joins/symmetric_hash_join.rs | 136 ++++++------------ 
.../physical-expr/src/aggregate/min_max.rs | 130 +++++++---------- 2 files changed, 98 insertions(+), 168 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index fa40b420a567..2462d85a9bb0 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -2771,88 +2771,32 @@ mod tests { assert_eq!(left_side_joiner.visited_rows.is_empty(), should_be_empty); Ok(()) } - - #[tokio::test(flavor = "multi_thread")] - async fn with_temporal_columns() -> Result<()> { - let join_type = JoinType::Full; - let config = SessionConfig::new().with_repartition_joins(false); - let session_ctx = SessionContext::with_config(config); - let task_ctx = session_ctx.task_ctx(); - let (left_batch, right_batch) = build_sides_record_batches(TABLE_SIZE, (10, 10))?; - let left_schema = &left_batch.schema(); - let right_schema = &right_batch.schema(); - let on = vec![( - Column::new_with_schema("lc1", left_schema)?, - Column::new_with_schema("rc1", right_schema)?, - )]; - let left_sorted = vec![PhysicalSortExpr { - expr: col("lt1", left_schema)?, - options: SortOptions { - descending: false, - nulls_first: true, - }, - }]; - let right_sorted = vec![PhysicalSortExpr { - expr: col("rt1", right_schema)?, - options: SortOptions { - descending: false, - nulls_first: true, - }, - }]; - let (left, right) = create_memory_table( - left_batch, - right_batch, - Some(left_sorted), - Some(right_sorted), - 13, - )?; - let intermediate_schema = Schema::new(vec![ - Field::new( - "left", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - Field::new( - "right", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - ]); - let filter_expr = join_expr_tests_fixture_temporal( - 0, - col("left", &intermediate_schema)?, - col("right", &intermediate_schema)?, - &intermediate_schema, - ); - let column_indices 
= vec![ - ColumnIndex { - index: 3, - side: JoinSide::Left, - }, - ColumnIndex { - index: 3, - side: JoinSide::Right, - }, - ]; - let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); - experiment( - left, - right, - Some(filter), - join_type, - on.clone(), - task_ctx.clone(), - ) - .await?; - Ok(()) - } + #[rstest] #[tokio::test(flavor = "multi_thread")] - async fn with_temporal_columns_2() -> Result<()> { - let join_type = JoinType::Full; + async fn testing_with_temporal_columns( + #[values( + JoinType::Inner, + JoinType::Left, + JoinType::Right, + JoinType::RightSemi, + JoinType::LeftSemi, + JoinType::LeftAnti, + JoinType::RightAnti, + JoinType::Full + )] + join_type: JoinType, + #[values( + (4, 5), + (99, 12), + )] + cardinality: (i32, i32), + #[values(0, 1)] case_expr: usize, + ) -> Result<()> { let config = SessionConfig::new().with_repartition_joins(false); let session_ctx = SessionContext::with_config(config); let task_ctx = session_ctx.task_ctx(); - let (left_batch, right_batch) = build_sides_record_batches(TABLE_SIZE, (10, 10))?; + let (left_batch, right_batch) = + build_sides_record_batches(TABLE_SIZE, cardinality)?; let left_schema = &left_batch.schema(); let right_schema = &right_batch.schema(); let on = vec![( @@ -2893,7 +2837,7 @@ mod tests { ), ]); let filter_expr = join_expr_tests_fixture_temporal( - 1, + case_expr, col("left", &intermediate_schema)?, col("right", &intermediate_schema)?, &intermediate_schema, @@ -2909,24 +2853,34 @@ mod tests { }, ]; let filter = JoinFilter::new(filter_expr, column_indices, intermediate_schema); - experiment( - left, - right, - Some(filter), - join_type, - on.clone(), - task_ctx.clone(), - ) - .await?; + experiment(left, right, Some(filter), join_type, on, task_ctx).await?; Ok(()) } + #[rstest] #[tokio::test(flavor = "multi_thread")] - async fn with_temporal_columns_3() -> Result<()> { - let join_type = JoinType::Full; + async fn test_with_interval_columns( + #[values( + JoinType::Inner, + 
JoinType::Left, + JoinType::Right, + JoinType::RightSemi, + JoinType::LeftSemi, + JoinType::LeftAnti, + JoinType::RightAnti, + JoinType::Full + )] + join_type: JoinType, + #[values( + (4, 5), + (99, 12), + )] + cardinality: (i32, i32), + ) -> Result<()> { let config = SessionConfig::new().with_repartition_joins(false); let session_ctx = SessionContext::with_config(config); let task_ctx = session_ctx.task_ctx(); - let (left_batch, right_batch) = build_sides_record_batches(TABLE_SIZE, (10, 10))?; + let (left_batch, right_batch) = + build_sides_record_batches(TABLE_SIZE, cardinality)?; let left_schema = &left_batch.schema(); let right_schema = &right_batch.schema(); let on = vec![( diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 58739af28de8..38374c2808ca 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -331,6 +331,22 @@ macro_rules! typed_min_max_string { }}; } +macro_rules! interval_min_max { + ($OP:expr, $LHS:expr, $RHS:expr) => {{ + match (stringify!($OP), $LHS.partial_cmp(&$RHS)) { + ("min", Some(std::cmp::Ordering::Greater)) + | ("max", Some(std::cmp::Ordering::Less)) => $RHS.clone(), + (_, Some(_)) => $LHS.clone(), + (_, _) => { + return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive {} operation", + stringify!($OP) + ))) + } + } + }}; +} + // min/max of two scalar values of the same type macro_rules! min_max { ($VALUE:expr, $DELTA:expr, $OP:ident) => {{ @@ -448,30 +464,10 @@ macro_rules! 
min_max { typed_min_max!(lhs, rhs, IntervalYearMonth, $OP) } ( - ScalarValue::IntervalYearMonth(lhs), - ScalarValue::IntervalDayTime(rhs), - ) => { - match (stringify!($OP) , ScalarValue::IntervalYearMonth(*lhs).partial_cmp(&ScalarValue::IntervalDayTime(*rhs))) { - ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalDayTime(*rhs), - (_, Some(_)) => ScalarValue::IntervalYearMonth(*lhs), - (_,_) => return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", - ScalarValue::IntervalYearMonth(*lhs), ScalarValue::IntervalDayTime(*rhs) - ))) - } - } - ( - ScalarValue::IntervalYearMonth(lhs), + ScalarValue::IntervalMonthDayNano(lhs), ScalarValue::IntervalMonthDayNano(rhs), ) => { - match (stringify!($OP) , ScalarValue::IntervalYearMonth(*lhs).partial_cmp(&ScalarValue::IntervalMonthDayNano(*rhs))) { - ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalMonthDayNano(*rhs), - (_, Some(_)) => ScalarValue::IntervalYearMonth(*lhs), - (_,_) => return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", - ScalarValue::IntervalYearMonth(*lhs), ScalarValue::IntervalMonthDayNano(*rhs) - ))) - } + typed_min_max!(lhs, rhs, IntervalMonthDayNano, $OP) } ( ScalarValue::IntervalDayTime(lhs), @@ -480,62 +476,25 @@ macro_rules! 
min_max { typed_min_max!(lhs, rhs, IntervalDayTime, $OP) } ( - ScalarValue::IntervalDayTime(lhs), - ScalarValue::IntervalYearMonth(rhs), - ) => { - match (stringify!($OP) , ScalarValue::IntervalDayTime(*lhs).partial_cmp(&ScalarValue::IntervalYearMonth(*rhs))) { - ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalYearMonth(*rhs), - (_, Some(_)) => ScalarValue::IntervalDayTime(*lhs), - (_,_) => return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", - ScalarValue::IntervalDayTime(*lhs), ScalarValue::IntervalYearMonth(*rhs) - ))) - } - } - ( - ScalarValue::IntervalDayTime(lhs), - ScalarValue::IntervalMonthDayNano(rhs), - ) => { - match (stringify!($OP) , ScalarValue::IntervalDayTime(*lhs).partial_cmp(&ScalarValue::IntervalMonthDayNano(*rhs))) { - ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalMonthDayNano(*rhs), - (_, Some(_)) => ScalarValue::IntervalDayTime(*lhs), - (_,_) => return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", - ScalarValue::IntervalDayTime(*lhs), ScalarValue::IntervalMonthDayNano(*rhs) - ))) - } - } - ( - ScalarValue::IntervalMonthDayNano(lhs), - ScalarValue::IntervalMonthDayNano(rhs), - ) => { - typed_min_max!(lhs, rhs, IntervalMonthDayNano, $OP) - } - ( - ScalarValue::IntervalMonthDayNano(lhs), - ScalarValue::IntervalYearMonth(rhs), - ) => { - match (stringify!($OP) , ScalarValue::IntervalMonthDayNano(*lhs).partial_cmp(&ScalarValue::IntervalYearMonth(*rhs))) { - ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalYearMonth(*rhs), - (_, Some(_)) => ScalarValue::IntervalMonthDayNano(*lhs), - (_,_) => return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", - 
ScalarValue::IntervalMonthDayNano(*lhs), ScalarValue::IntervalYearMonth(*rhs) - ))) - } - } - ( - ScalarValue::IntervalMonthDayNano(lhs), - ScalarValue::IntervalDayTime(rhs), + ScalarValue::IntervalYearMonth(_), + ScalarValue::IntervalMonthDayNano(_), + ) | ( + ScalarValue::IntervalYearMonth(_), + ScalarValue::IntervalDayTime(_), + ) | ( + ScalarValue::IntervalMonthDayNano(_), + ScalarValue::IntervalDayTime(_), + ) | ( + ScalarValue::IntervalMonthDayNano(_), + ScalarValue::IntervalYearMonth(_), + ) | ( + ScalarValue::IntervalDayTime(_), + ScalarValue::IntervalYearMonth(_), + ) | ( + ScalarValue::IntervalDayTime(_), + ScalarValue::IntervalMonthDayNano(_), ) => { - match (stringify!($OP) , ScalarValue::IntervalMonthDayNano(*lhs).partial_cmp(&ScalarValue::IntervalDayTime(*rhs))) { - ("min", Some(std::cmp::Ordering::Greater)) | ("max", Some(std::cmp::Ordering::Less)) => ScalarValue::IntervalDayTime(*rhs), - (_, Some(_)) => ScalarValue::IntervalMonthDayNano(*lhs), - (_,_) => return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive scalars of incompatible types {:?} - {:?}", - ScalarValue::IntervalMonthDayNano(*lhs), ScalarValue::IntervalDayTime(*rhs) - ))) - } + interval_min_max!($OP, $VALUE, $DELTA) } e => { return Err(DataFusionError::Internal(format!( @@ -1395,4 +1354,21 @@ mod tests { ScalarValue::Time64Nanosecond(Some(5)) ) } + + #[test] + fn sa() -> Result<()> { + let sa = ScalarValue::Float64(Some(3.1)); + let ass = ScalarValue::Float64(Some(3.1)); + + let mete = &sa; + let metee = &ass; + + let saaa = match mete.partial_cmp(metee) { + Some(std::cmp::Ordering::Greater) => mete.clone(), + Some(_) => metee.clone(), + _ => unreachable!(), + }; + println!("{:?}", saaa); + Ok(()) + } } From 9b623cd59bb80122b656b40cb59fa9e0e96fc27c Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Fri, 7 Apr 2023 20:31:49 +0300 Subject: [PATCH 50/55] Update min_max.rs --- 
.../physical-expr/src/aggregate/min_max.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 38374c2808ca..311835f3bd08 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -1354,21 +1354,4 @@ mod tests { ScalarValue::Time64Nanosecond(Some(5)) ) } - - #[test] - fn sa() -> Result<()> { - let sa = ScalarValue::Float64(Some(3.1)); - let ass = ScalarValue::Float64(Some(3.1)); - - let mete = &sa; - let metee = &ass; - - let saaa = match mete.partial_cmp(metee) { - Some(std::cmp::Ordering::Greater) => mete.clone(), - Some(_) => metee.clone(), - _ => unreachable!(), - }; - println!("{:?}", saaa); - Ok(()) - } } From 3f92e563f76641882fde7faf154ea796851a02b2 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 10 Apr 2023 11:09:11 +0300 Subject: [PATCH 51/55] arithmetics moved into macros --- datafusion-cli/Cargo.lock | 30 +- .../physical-expr/src/expressions/binary.rs | 353 +++++++++++------- 2 files changed, 236 insertions(+), 147 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 55f37b1d15d8..f1ed5fd27fc1 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -455,9 +455,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa48fa079165080f11d7753fd0bc175b7d391f276b965fe4b55bfad67856e463" +checksum = "cf9cc2b23599e6d7479755f3594285efb3f74a1bdca7a7374948bc831e23a552" dependencies = [ "chrono", "chrono-tz-build", @@ -957,13 +957,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d6a0976c999d473fe89ad888d5a284e55366d9dc9038b1ba2aa15128c4afa0" +checksum = 
"4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -997,13 +997,13 @@ dependencies = [ [[package]] name = "fd-lock" -version = "3.0.11" +version = "3.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9799aefb4a2e4a01cc47610b1dd47c18ab13d991f27bbcaed9296f5a53d5cbad" +checksum = "39ae6b3d9530211fb3b12a95374b8b0823be812f53d09e18c5675c0146b09642" dependencies = [ "cfg-if", "rustix", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -1148,9 +1148,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", @@ -2148,16 +2148,16 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.7" +version = "0.37.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aae838e49b3d63e9274e1c01833cc8139d3fec468c3b84688c628f44b1ae11d" +checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -2730,9 +2730,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" +checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" dependencies = [ "getrandom", ] diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 
d30f219bd961..6c709891fe18 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1244,6 +1244,24 @@ pub fn binary( Ok(Arc::new(BinaryExpr::new(lhs, op, rhs))) } +macro_rules! sub_timestamp_macro { + ($array:expr, $rhs:expr, $caster:expr, $interval_type:ty, $opt_tz_lhs:expr, $multiplier:expr, + $opt_tz_rhs:expr, $unit_sub:expr, $naive_sub_fn:expr, $counter:expr) => {{ + let prim_array = $caster(&$array)?; + let ret: PrimitiveArray<$interval_type> = try_unary(prim_array, |lhs| { + let (parsed_lhs_tz, parsed_rhs_tz) = + (parse_timezones($opt_tz_lhs)?, parse_timezones($opt_tz_rhs)?); + let (naive_lhs, naive_rhs) = calculate_naives::<$unit_sub>( + lhs.mul_wrapping($multiplier), + parsed_lhs_tz, + $rhs.mul_wrapping($multiplier), + parsed_rhs_tz, + )?; + Ok($naive_sub_fn($counter(&naive_lhs), $counter(&naive_rhs))) + })?; + Arc::new(ret) as ArrayRef + }}; +} /// This function handles the Timestamp - Timestamp operations, /// where the first one is an array, and the second one is a scalar, /// hence the result is also an array. 
@@ -1253,89 +1271,69 @@ pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result { - let prim_array = as_timestamp_second_array(&array)?; - let ret: PrimitiveArray = - try_unary(prim_array, |lhs| { - let (parsed_lhs_tz, parsed_rhs_tz) = - (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); - let (naive_lhs, naive_rhs) = calculate_naives::( - lhs.mul_wrapping(1000), - parsed_lhs_tz, - rhs.mul_wrapping(1000), - parsed_rhs_tz, - )?; - Ok(seconds_sub( - NaiveDateTime::timestamp(&naive_lhs), - NaiveDateTime::timestamp(&naive_rhs), - )) - })?; - Arc::new(ret) as ArrayRef + sub_timestamp_macro!( + array, + rhs, + as_timestamp_second_array, + IntervalDayTimeType, + opt_tz_lhs, + 1000, + opt_tz_rhs, + MILLISECOND_MODE, + seconds_sub, + NaiveDateTime::timestamp + ) } ( DataType::Timestamp(TimeUnit::Millisecond, opt_tz_lhs), ScalarValue::TimestampMillisecond(Some(rhs), opt_tz_rhs), ) => { - let prim_array = as_timestamp_millisecond_array(&array)?; - let ret: PrimitiveArray = - try_unary(prim_array, |lhs| { - let (parsed_lhs_tz, parsed_rhs_tz) = - (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); - let (naive_lhs, naive_rhs) = calculate_naives::( - lhs, - parsed_lhs_tz, - *rhs, - parsed_rhs_tz, - )?; - Ok(milliseconds_sub( - NaiveDateTime::timestamp_millis(&naive_lhs), - NaiveDateTime::timestamp_millis(&naive_rhs), - )) - })?; - Arc::new(ret) as ArrayRef + sub_timestamp_macro!( + array, + rhs, + as_timestamp_millisecond_array, + IntervalDayTimeType, + opt_tz_lhs, + 1, + opt_tz_rhs, + MILLISECOND_MODE, + milliseconds_sub, + NaiveDateTime::timestamp_millis + ) } ( DataType::Timestamp(TimeUnit::Microsecond, opt_tz_lhs), ScalarValue::TimestampMicrosecond(Some(rhs), opt_tz_rhs), ) => { - let prim_array = as_timestamp_microsecond_array(&array)?; - let ret: PrimitiveArray = - try_unary(prim_array, |lhs| { - let (parsed_lhs_tz, parsed_rhs_tz) = - (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); - let (naive_lhs, naive_rhs) = 
calculate_naives::( - lhs.mul_wrapping(1000), - parsed_lhs_tz, - rhs.mul_wrapping(1000), - parsed_rhs_tz, - )?; - Ok(microseconds_sub( - NaiveDateTime::timestamp_micros(&naive_lhs), - NaiveDateTime::timestamp_micros(&naive_rhs), - )) - })?; - Arc::new(ret) as ArrayRef + sub_timestamp_macro!( + array, + rhs, + as_timestamp_microsecond_array, + IntervalMonthDayNanoType, + opt_tz_lhs, + 1000, + opt_tz_rhs, + NANOSECOND_MODE, + microseconds_sub, + NaiveDateTime::timestamp_micros + ) } ( DataType::Timestamp(TimeUnit::Nanosecond, opt_tz_lhs), ScalarValue::TimestampNanosecond(Some(rhs), opt_tz_rhs), ) => { - let prim_array = as_timestamp_nanosecond_array(&array)?; - let ret: PrimitiveArray = - try_unary(prim_array, |lhs| { - let (parsed_lhs_tz, parsed_rhs_tz) = - (parse_timezones(opt_tz_lhs)?, parse_timezones(opt_tz_rhs)?); - let (naive_lhs, naive_rhs) = calculate_naives::( - lhs, - parsed_lhs_tz, - *rhs, - parsed_rhs_tz, - )?; - Ok(nanoseconds_sub( - NaiveDateTime::timestamp_nanos(&naive_lhs), - NaiveDateTime::timestamp_nanos(&naive_rhs), - )) - })?; - Arc::new(ret) as ArrayRef + sub_timestamp_macro!( + array, + rhs, + as_timestamp_nanosecond_array, + IntervalMonthDayNanoType, + opt_tz_lhs, + 1, + opt_tz_rhs, + NANOSECOND_MODE, + nanoseconds_sub, + NaiveDateTime::timestamp_nanos + ) } (_, _) => { return Err(DataFusionError::Internal(format!( @@ -1347,6 +1345,17 @@ pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result {{ + let array = $as_timestamp(&$array)?; + let ret: PrimitiveArray<$ts_type> = + try_unary::<$ts_type, _, $ts_type>(array, |ts_s| { + Ok($fn_op(ts_s, $scalar, $sign)?) + })?; + Arc::new(ret.with_timezone_opt($tz.clone())) as ArrayRef + }}; +} /// This function handles the Timestamp - Interval operations, /// where the first one is an array, and the second one is a scalar, /// hence the result is also an array. 
@@ -1357,40 +1366,48 @@ pub fn ts_scalar_interval_op( ) -> Result { let ret = match array.data_type() { DataType::Timestamp(TimeUnit::Second, tz) => { - let array = as_timestamp_second_array(&array)?; - let ret: PrimitiveArray = - try_unary::( - array, - |ts_s| Ok(seconds_add(ts_s, scalar, sign)?), - )?; - Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + sub_timestamp_interval_macro!( + array, + as_timestamp_second_array, + TimestampSecondType, + seconds_add, + scalar, + sign, + tz + ) } DataType::Timestamp(TimeUnit::Millisecond, tz) => { - let array = as_timestamp_millisecond_array(&array)?; - let ret: PrimitiveArray = - try_unary::( - array, - |ts_ms| Ok(milliseconds_add(ts_ms, scalar, sign)?), - )?; - Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + sub_timestamp_interval_macro!( + array, + as_timestamp_millisecond_array, + TimestampMillisecondType, + milliseconds_add, + scalar, + sign, + tz + ) } DataType::Timestamp(TimeUnit::Microsecond, tz) => { - let array = as_timestamp_microsecond_array(&array)?; - let ret: PrimitiveArray = - try_unary::( - array, - |ts_us| Ok(microseconds_add(ts_us, scalar, sign)?), - )?; - Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + sub_timestamp_interval_macro!( + array, + as_timestamp_microsecond_array, + TimestampMicrosecondType, + microseconds_add, + scalar, + sign, + tz + ) } DataType::Timestamp(TimeUnit::Nanosecond, tz) => { - let array = as_timestamp_nanosecond_array(&array)?; - let ret: PrimitiveArray = - try_unary::( - array, - |ts_ns| Ok(nanoseconds_add(ts_ns, scalar, sign)?), - )?; - Arc::new(ret.with_timezone_opt(tz.clone())) as ArrayRef + sub_timestamp_interval_macro!( + array, + as_timestamp_nanosecond_array, + TimestampNanosecondType, + nanoseconds_add, + scalar, + sign, + tz + ) } _ => Err(DataFusionError::Internal(format!( "Invalid lhs type for Timestamp vs Interval operations: {}", @@ -1399,6 +1416,30 @@ pub fn ts_scalar_interval_op( }; Ok(ColumnarValue::Array(ret)) } + +macro_rules! 
sub_interval_macro { + ($array:expr, $as_interval:expr, $interval_type:ty, $fn_op:expr, $scalar:expr, $sign:expr) => {{ + let array = $as_interval(&$array)?; + let ret: PrimitiveArray<$interval_type> = + unary(array, |lhs| $fn_op(lhs, *$scalar, $sign)); + Arc::new(ret) as ArrayRef + }}; +} +macro_rules! sub_interval_cross_macro { + ($array:expr, $as_interval:expr, $commute:expr, $fn_op:expr, $scalar:expr, $sign:expr, $t1:ty, $t2:ty) => {{ + let array = $as_interval(&$array)?; + let ret: PrimitiveArray = if $commute { + unary(array, |lhs| { + $fn_op(*$scalar as $t1, lhs as $t2, $sign, $commute) + }) + } else { + unary(array, |lhs| { + $fn_op(lhs as $t1, *$scalar as $t2, $sign, $commute) + }) + }; + Arc::new(ret) as ArrayRef + }}; +} /// This function handles the Interval - Interval operations, /// where the first one is an array, and the second one is a scalar, /// hence the result is also an interval array. @@ -1412,82 +1453,130 @@ pub fn interval_scalar_interval_op( DataType::Interval(IntervalUnit::YearMonth), ScalarValue::IntervalYearMonth(Some(rhs)), ) => { - let array = as_interval_ym_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_ym(lhs, *rhs, sign)); - Arc::new(ret) as ArrayRef + sub_interval_macro!( + array, + as_interval_ym_array, + IntervalYearMonthType, + op_ym, + rhs, + sign + ) } ( DataType::Interval(IntervalUnit::YearMonth), ScalarValue::IntervalDayTime(Some(rhs)), ) => { - let array = as_interval_ym_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_ym_dt(lhs, *rhs, sign, false)); - Arc::new(ret) as ArrayRef + sub_interval_cross_macro!( + array, + as_interval_ym_array, + false, + op_ym_dt, + rhs, + sign, + i32, + i64 + ) } ( DataType::Interval(IntervalUnit::YearMonth), ScalarValue::IntervalMonthDayNano(Some(rhs)), ) => { - let array = as_interval_ym_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_ym_mdn(lhs, *rhs, sign, false)); - Arc::new(ret) as ArrayRef + sub_interval_cross_macro!( + array, 
+ as_interval_ym_array, + false, + op_ym_mdn, + rhs, + sign, + i32, + i128 + ) } ( DataType::Interval(IntervalUnit::DayTime), ScalarValue::IntervalYearMonth(Some(rhs)), ) => { - let array = as_interval_dt_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_ym_dt(*rhs, lhs, sign, true)); - Arc::new(ret) as ArrayRef + sub_interval_cross_macro!( + array, + as_interval_dt_array, + true, + op_ym_dt, + rhs, + sign, + i32, + i64 + ) } ( DataType::Interval(IntervalUnit::DayTime), ScalarValue::IntervalDayTime(Some(rhs)), ) => { - let array = as_interval_dt_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_dt(lhs, *rhs, sign)); - Arc::new(ret) as ArrayRef + sub_interval_macro!( + array, + as_interval_dt_array, + IntervalDayTimeType, + op_dt, + rhs, + sign + ) } ( DataType::Interval(IntervalUnit::DayTime), ScalarValue::IntervalMonthDayNano(Some(rhs)), ) => { - let array = as_interval_dt_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_dt_mdn(lhs, *rhs, sign, false)); - Arc::new(ret) as ArrayRef + sub_interval_cross_macro!( + array, + as_interval_dt_array, + false, + op_dt_mdn, + rhs, + sign, + i64, + i128 + ) } ( DataType::Interval(IntervalUnit::MonthDayNano), ScalarValue::IntervalYearMonth(Some(rhs)), ) => { - let array = as_interval_mdn_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_ym_mdn(*rhs, lhs, sign, true)); - Arc::new(ret) as ArrayRef + sub_interval_cross_macro!( + array, + as_interval_mdn_array, + true, + op_ym_mdn, + rhs, + sign, + i32, + i128 + ) } ( DataType::Interval(IntervalUnit::MonthDayNano), ScalarValue::IntervalDayTime(Some(rhs)), ) => { - let array = as_interval_mdn_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_dt_mdn(*rhs, lhs, sign, true)); - Arc::new(ret) as ArrayRef + sub_interval_cross_macro!( + array, + as_interval_mdn_array, + true, + op_dt_mdn, + rhs, + sign, + i64, + i128 + ) } ( DataType::Interval(IntervalUnit::MonthDayNano), 
ScalarValue::IntervalMonthDayNano(Some(rhs)), ) => { - let array = as_interval_mdn_array(&array)?; - let ret: PrimitiveArray = - unary(array, |lhs| op_mdn(lhs, *rhs, sign)); - Arc::new(ret) as ArrayRef + sub_interval_macro!( + array, + as_interval_mdn_array, + IntervalMonthDayNanoType, + op_mdn, + rhs, + sign + ) } _ => Err(DataFusionError::Internal(format!( "Invalid operands for Interval vs Interval operations: {} - {}", From 98f4326525ad58da418f907b1878653bab519789 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 10 Apr 2023 17:54:26 +0300 Subject: [PATCH 52/55] fix cargo.lock --- datafusion-cli/Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index d0e23b76266e..2c79d7375200 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -963,7 +963,7 @@ checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -2148,9 +2148,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.8" +version = "0.37.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aef160324be24d31a62147fae491c14d2204a3865c7ca8c3b0d7f7bcb3ea635" +checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" dependencies = [ "bitflags", "errno", From 9ec794df40c31178708770656d0485e4ccb7e0fc Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 11 Apr 2023 13:10:29 +0300 Subject: [PATCH 53/55] remove unwraps from tests --- .../joins/symmetric_hash_join.rs | 6 +- .../physical-expr/src/intervals/test_utils.rs | 60 ++++++++++--------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index 2462d85a9bb0..cfd8e0a7fe7b 100644 --- 
a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -1738,7 +1738,7 @@ mod tests { left_col: Arc, right_col: Arc, schema: &Schema, - ) -> Arc { + ) -> Result> { match expr_id { // constructs ((left_col - INTERVAL '100ms') > (right_col - INTERVAL '200ms')) AND ((left_col - INTERVAL '450ms') < (right_col - INTERVAL '300ms')) 0 => gen_conjunctive_temporal_expr( @@ -2841,7 +2841,7 @@ mod tests { col("left", &intermediate_schema)?, col("right", &intermediate_schema)?, &intermediate_schema, - ); + )?; let column_indices = vec![ ColumnIndex { index: 3, @@ -2917,7 +2917,7 @@ mod tests { col("left", &intermediate_schema)?, col("right", &intermediate_schema)?, &intermediate_schema, - ); + )?; let column_indices = vec![ ColumnIndex { index: 9, diff --git a/datafusion/physical-expr/src/intervals/test_utils.rs b/datafusion/physical-expr/src/intervals/test_utils.rs index 7219b474efac..3070cb0a8fce 100644 --- a/datafusion/physical-expr/src/intervals/test_utils.rs +++ b/datafusion/physical-expr/src/intervals/test_utils.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use crate::expressions::{BinaryExpr, DateTimeIntervalExpr, Literal}; use crate::PhysicalExpr; use arrow_schema::Schema; -use datafusion_common::ScalarValue; +use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::Operator; #[allow(clippy::too_many_arguments)] @@ -83,34 +83,36 @@ pub fn gen_conjunctive_temporal_expr( c: ScalarValue, d: ScalarValue, schema: &Schema, -) -> Arc { - let left_and_1 = Arc::new( - DateTimeIntervalExpr::try_new( - left_col.clone(), - op_1, - Arc::new(Literal::new(a)), - schema, - ) - .unwrap(), - ); - let left_and_2 = Arc::new( - DateTimeIntervalExpr::try_new( - right_col.clone(), - op_2, - Arc::new(Literal::new(b)), - schema, - ) - .unwrap(), - ); - let right_and_1 = Arc::new( - DateTimeIntervalExpr::try_new(left_col, op_3, Arc::new(Literal::new(c)), schema) - .unwrap(), - ); - let right_and_2 = 
Arc::new( - DateTimeIntervalExpr::try_new(right_col, op_4, Arc::new(Literal::new(d)), schema) - .unwrap(), - ); +) -> Result, DataFusionError> { + let left_and_1 = Arc::new(DateTimeIntervalExpr::try_new( + left_col.clone(), + op_1, + Arc::new(Literal::new(a)), + schema, + )?); + let left_and_2 = Arc::new(DateTimeIntervalExpr::try_new( + right_col.clone(), + op_2, + Arc::new(Literal::new(b)), + schema, + )?); + let right_and_1 = Arc::new(DateTimeIntervalExpr::try_new( + left_col, + op_3, + Arc::new(Literal::new(c)), + schema, + )?); + let right_and_2 = Arc::new(DateTimeIntervalExpr::try_new( + right_col, + op_4, + Arc::new(Literal::new(d)), + schema, + )?); let left_expr = Arc::new(BinaryExpr::new(left_and_1, Operator::Gt, left_and_2)); let right_expr = Arc::new(BinaryExpr::new(right_and_1, Operator::Lt, right_and_2)); - Arc::new(BinaryExpr::new(left_expr, Operator::And, right_expr)) + Ok(Arc::new(BinaryExpr::new( + left_expr, + Operator::And, + right_expr, + ))) } From ff56e1734e80b2f7242e890d302262db2b032460 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Tue, 11 Apr 2023 16:52:36 -0500 Subject: [PATCH 54/55] Remove run-time string comparison from the interval min/max macro --- .../physical-expr/src/aggregate/min_max.rs | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 311835f3bd08..589501a017a7 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -331,17 +331,24 @@ macro_rules! typed_min_max_string { }}; } +macro_rules! interval_choose_min_max { + (min) => { + std::cmp::Ordering::Greater + }; + (max) => { + std::cmp::Ordering::Less + }; +} + macro_rules! 
interval_min_max { - ($OP:expr, $LHS:expr, $RHS:expr) => {{ - match (stringify!($OP), $LHS.partial_cmp(&$RHS)) { - ("min", Some(std::cmp::Ordering::Greater)) - | ("max", Some(std::cmp::Ordering::Less)) => $RHS.clone(), - (_, Some(_)) => $LHS.clone(), - (_, _) => { - return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive {} operation", - stringify!($OP) - ))) + ($OP:tt, $LHS:expr, $RHS:expr) => {{ + match $LHS.partial_cmp(&$RHS) { + Some(interval_choose_min_max!($OP)) => $RHS.clone(), + Some(_) => $LHS.clone(), + None => { + return Err(DataFusionError::Internal( + "Comparison error while computing interval min/max".to_string(), + )) } } }}; From 7ca4f0b9c6f0657be90c0d4bfd430dd6fd16e1bd Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Wed, 12 Apr 2023 10:19:45 +0300 Subject: [PATCH 55/55] adapt upstream changes of timezone signature --- .../physical-expr/src/expressions/binary.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index c83ca625e5fc..420abe313d92 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1279,9 +1279,9 @@ pub fn ts_scalar_ts_op(array: ArrayRef, scalar: &ScalarValue) -> Result Result Result Result