Skip to content

Commit

Permalink
Support parsing unpadded dates
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Jul 14, 2023
1 parent 5bb363d commit c1aa19a
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 7 deletions.
4 changes: 4 additions & 0 deletions arrow-cast/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ harness = false
name = "parse_time"
harness = false

[[bench]]
name = "parse_date"
harness = false

[[bench]]
name = "parse_decimal"
harness = false
34 changes: 34 additions & 0 deletions arrow-cast/benches/parse_date.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_array::types::Date32Type;
use arrow_cast::parse::Parser;
use criterion::*;

/// Benchmarks `Date32Type::parse` on date strings with and without
/// zero-padded month and day fields.
///
/// Every input string is registered as its own benchmark, using the string
/// itself as the benchmark name.
fn criterion_benchmark(c: &mut Criterion) {
    let inputs = ["2020-09-08", "2020-9-8", "2020-09-8", "2020-9-08"];

    for input in inputs {
        // Hide the literal from the optimiser before it is captured below
        let date = black_box(input);
        c.bench_function(date, |bencher| {
            bencher.iter(|| Date32Type::parse(date).unwrap());
        });
    }
}

// Collect `criterion_benchmark` into a benchmark group named `benches`, then
// hand that group to `criterion_main!`, which provides the entry point for
// this bench target (it is declared with `harness = false` in Cargo.toml).
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
13 changes: 9 additions & 4 deletions arrow-cast/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7420,7 +7420,7 @@ mod tests {

let a = StringArray::from(vec![
"2000-01-01", // valid date with leading 0s
"2000-2-2", // invalid date without leading 0s
"2000-2-2", // valid date without leading 0s
"2000-00-00", // invalid month and day
"2000-01-01T12:00:00", // date + time is invalid
"2000", // just a year is invalid
Expand All @@ -7438,12 +7438,17 @@ mod tests {
assert!(c.is_valid(0)); // "2000-01-01"
assert_eq!(date_value, c.value(0));

assert!(!c.is_valid(1)); // "2000-2-2"
let date_value = since(
NaiveDate::from_ymd_opt(2000, 2, 2).unwrap(),
from_ymd(1970, 1, 1).unwrap(),
)
.num_days() as i32;
assert!(c.is_valid(1)); // "2000-2-2"
assert_eq!(date_value, c.value(1));

// test invalid inputs
assert!(!c.is_valid(2)); // "2000-00-00"
assert!(c.is_valid(3)); // "2000-01-01T12:00:00"
assert_eq!(date_value, c.value(3));
assert!(!c.is_valid(3)); // "2000-01-01T12:00:00"
assert!(!c.is_valid(4)); // "2000"
}

Expand Down
83 changes: 80 additions & 3 deletions arrow-cast/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ fn parse_nanos<const N: usize, const O: u8>(digits: &[u8]) -> u32 {
* 10_u32.pow((9 - N) as _)
}

/// Helper for parsing timestamps
/// Helper for parsing RFC3339 timestamps
struct TimestampParser {
/// The timestamp bytes to parse minus `b'0'`
///
Expand Down Expand Up @@ -579,10 +579,64 @@ const EPOCH_DAYS_FROM_CE: i32 = 719_163;
/// Error message used when a conversion to nanoseconds is requested for a
/// date that lies outside the supported interval
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804";

/// Parses a calendar date of the form `YYYY-MM-DD`, where the month and the
/// day may each be written with either one or two digits (`2020-09-08`,
/// `2020-9-8`, `2020-09-8` and `2020-9-08` are all accepted).
///
/// Returns `None` when:
/// - the input is longer than 10 bytes,
/// - the year is not exactly four digits followed by a hyphen,
/// - any byte other than a digit or the two separating hyphens is present,
///   including trailing bytes after the day, or
/// - the fields do not name a valid calendar date.
fn parse_date(string: &str) -> Option<NaiveDate> {
    if string.len() > 10 {
        return None;
    }
    let mut digits = [0_u8; 10];
    let mut mask = 0_u16;

    // Treating all bytes the same way helps LLVM vectorise this correctly
    for (idx, (o, i)) in digits.iter_mut().zip(string.bytes()).enumerate() {
        *o = i.wrapping_sub(b'0');
        mask |= ((*o < 10) as u16) << idx
    }

    const HYPHEN: u8 = b'-'.wrapping_sub(b'0');

    if digits[4] != HYPHEN {
        return None;
    }

    // Bit `n` of `mask` is set when byte `n` is an ASCII digit. A clear bit
    // is ambiguous on its own: it means either "the input ended before this
    // position" or "a non-digit byte sits here". Matching on the length as
    // well rules out the second reading, so inputs with trailing junk such as
    // "2020-09-8x" are rejected instead of silently truncated.
    let (month, day) = match (mask, string.len()) {
        // YYYY-MM-DD
        (0b1101101111, 10) => {
            if digits[7] != HYPHEN {
                return None;
            }
            (digits[5] * 10 + digits[6], digits[8] * 10 + digits[9])
        }
        // YYYY-MM-D
        (0b101101111, 9) => {
            if digits[7] != HYPHEN {
                return None;
            }
            (digits[5] * 10 + digits[6], digits[8])
        }
        // YYYY-M-DD
        (0b110101111, 9) => {
            if digits[6] != HYPHEN {
                return None;
            }
            (digits[5], digits[7] * 10 + digits[8])
        }
        // YYYY-M-D
        (0b10101111, 8) => {
            if digits[6] != HYPHEN {
                return None;
            }
            (digits[5], digits[7])
        }
        _ => return None,
    };

    let year = u16::from(digits[0]) * 1000
        + u16::from(digits[1]) * 100
        + u16::from(digits[2]) * 10
        + u16::from(digits[3]);

    // Range validation (month 1-12, day valid for the month) is left to chrono
    NaiveDate::from_ymd_opt(i32::from(year), u32::from(month), u32::from(day))
}

impl Parser for Date32Type {
fn parse(string: &str) -> Option<i32> {
let parser = TimestampParser::new(string.as_bytes());
let date = parser.date()?;
let date = parse_date(string)?;
Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
}

Expand All @@ -594,9 +648,14 @@ impl Parser for Date32Type {

impl Parser for Date64Type {
/// Parses a date or date-time string into a millisecond timestamp.
///
/// Inputs shorter than 10 bytes are handed to `parse_date` and combined
/// with `NaiveTime::default()`; all other inputs go through
/// `string_to_datetime` with `Utc`. Returns `None` if the chosen parser
/// rejects the input.
fn parse(string: &str) -> Option<i64> {
    let millis = if string.len() < 10 {
        let date_only = NaiveDateTime::new(parse_date(string)?, NaiveTime::default());
        date_only.timestamp_millis()
    } else {
        string_to_datetime(&Utc, string).ok()?.timestamp_millis()
    };
    Some(millis)
}

fn parse_formatted(string: &str, format: &str) -> Option<i64> {
use chrono::format::Fixed;
Expand Down Expand Up @@ -1198,6 +1257,7 @@ fn parse_interval_components(
#[cfg(test)]
mod tests {
use super::*;
use arrow_array::temporal_conversions::date32_to_datetime;
use arrow_array::timezone::Tz;
use arrow_buffer::i256;

Expand Down Expand Up @@ -1467,6 +1527,23 @@ mod tests {
assert_ne!(dt, date.naive_utc());
}

/// `Date32Type::parse` must agree with chrono's own `NaiveDate` parser for
/// dates written with and without zero-padded month and day fields.
#[test]
fn parse_date32() {
    let inputs = [
        "2020-09-08",
        "2020-9-8",
        "2020-09-8",
        "2020-9-08",
        "2020-12-1",
        "1690-2-5",
    ];
    for input in inputs {
        let days = Date32Type::parse(input).unwrap();
        let parsed = date32_to_datetime(days).unwrap();
        let expected: NaiveDate = input.parse().unwrap();
        assert_eq!(parsed.date(), expected);
    }
}

#[test]
fn parse_time64_nanos() {
assert_eq!(
Expand Down

0 comments on commit c1aa19a

Please sign in to comment.