diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 2767cd6d93..5f40339b7b 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -4,7 +4,7 @@ use rustc_hash::FxHashMap; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; -use crate::schema::Type; +use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; @@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( ctx.path_to_unordered_id .get_or_allocate_unordered_id(json_path_writer.as_str()), ); + let val = val.truncate(DATE_TIME_PRECISION_INDEXED); term_buffer.append_type_and_fast_value(val); postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } @@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( /// Tries to infer a JSON type from a string and append it to the term. /// /// The term must be json + JSON path. 
-pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option { +pub fn convert_to_fast_value_and_append_to_json_term( + mut term: Term, + phrase: &str, + truncate_date_for_search: bool, +) -> Option { assert_eq!( term.value() .as_json_value_bytes() @@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st "JSON value bytes should be empty" ); if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { - let dt_utc = dt.to_offset(UtcOffset::UTC); - term.append_type_and_fast_value(DateTime::from_utc(dt_utc)); + let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC)); + if truncate_date_for_search { + dt = dt.truncate(DATE_TIME_PRECISION_INDEXED); + } + term.append_type_and_fast_value(dt); return Some(term); } if let Ok(i64_val) = str::parse::(phrase) { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 63ec869aa3..1af64607bd 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -673,7 +673,7 @@ mod tests { ] ); assert_eq!( - get_doc_ids(vec![Term::from_field_date( + get_doc_ids(vec![Term::from_field_date_for_search( date_field, DateTime::from_utc(curr_time) )])?, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 439f46aee8..1ab320006d 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -64,9 +64,9 @@ impl SegmentWriter { /// /// The arguments are defined as follows /// - /// - memory_budget: most of the segment writer data (terms, and postings lists recorders) - /// is stored in a memory arena. This makes it possible for the user to define - /// the flushing behavior as a memory limit. + /// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is + /// stored in a memory arena. This makes it possible for the user to define the flushing + /// behavior as a memory limit. 
/// - segment: The segment being written /// - schema pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result { @@ -431,7 +431,7 @@ mod tests { use crate::query::{PhraseQuery, QueryParser}; use crate::schema::{ Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value, - STORED, STRING, TEXT, + DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT, }; use crate::store::{Compressor, StoreReader, StoreWriter}; use crate::time::format_description::well_known::Rfc3339; @@ -651,7 +651,8 @@ mod tests { set_fast_val( DateTime::from_utc( OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(), - ), + ) + .truncate(DATE_TIME_PRECISION_INDEXED), term ) .serialized_value_bytes() diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 043d081df4..71c34cecda 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -241,7 +241,7 @@ impl MoreLikeThis { let timestamp = value.as_datetime().ok_or_else(|| { TantivyError::InvalidArgument("invalid value".to_string()) })?; - let term = Term::from_field_date(field, timestamp); + let term = Term::from_field_date_for_search(field, timestamp); *term_frequencies.entry(term).or_insert(0) += 1; } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 9c98e7c7bf..337e6c1deb 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// so-called default fields (as set up in the constructor). /// /// Assuming that the default fields are `body` and `title`, and the query parser is set with -/// conjunction as a default, our query will be interpreted as. +/// conjunction as a default, our query will be interpreted as. /// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`. 
/// By default, all tokenized and indexed fields are default fields. /// @@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// `body:Barack OR (body:Barack OR text:Obama)` . /// /// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is -/// interpreted -/// as `(a AND b) OR c`. +/// interpreted as `(a AND b) OR c`. /// /// * In addition to the boolean operators, the `-`, `+` can help define. These operators are /// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be @@ -272,8 +271,7 @@ impl QueryParser { /// Creates a `QueryParser`, given /// * an index - /// * a set of default fields used to search if no field is specifically defined - /// in the query. + /// * a set of default fields used to search if no field is specifically defined in the query. pub fn for_index(index: &Index, default_fields: Vec) -> QueryParser { QueryParser::new(index.schema(), default_fields, index.tokenizers().clone()) } @@ -500,6 +498,7 @@ impl QueryParser { convert_to_fast_value_and_append_to_json_term( get_term_with_path(), phrase, + false, ) { Ok(term) @@ -569,7 +568,7 @@ impl QueryParser { } FieldType::Date(_) => { let dt = OffsetDateTime::parse(phrase, &Rfc3339)?; - let dt_term = Term::from_field_date(field, DateTime::from_utc(dt)); + let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt)); Ok(vec![LogicalLiteral::Term(dt_term)]) } FieldType::Str(ref str_options) => { @@ -701,8 +700,8 @@ impl QueryParser { /// /// The terms are identified by a triplet: /// - tantivy field - /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON - /// object by naturally extending the json field name with a "." separated field_path + /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by + /// naturally extending the json field name with a "." 
separated field_path /// - field_phrase: the phrase that is being searched. /// /// The literal identifies the targeted field by a so-called *full field path*, @@ -965,7 +964,8 @@ fn generate_literals_for_json_object( || Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled()); // Try to convert the phrase to a fast value - if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase) + if let Some(term) = + convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true) { logical_literals.push(LogicalLiteral::Term(term)); } diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index d492462709..7f9e11ca0a 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -116,10 +116,7 @@ impl Query for RangeQuery { let field_type = schema.get_field_entry(self.field()).field_type(); if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) { - Ok(Box::new(FastFieldRangeWeight::new( - self.field(), - self.bounds.clone(), - ))) + Ok(Box::new(FastFieldRangeWeight::new(self.bounds.clone()))) } else { if field_type.is_json() { return Err(crate::TantivyError::InvalidArgument( diff --git a/src/query/range_query/range_query_u64_fastfield.rs b/src/query/range_query/range_query_u64_fastfield.rs index b7b3e484ad..40ed51f592 100644 --- a/src/query/range_query/range_query_u64_fastfield.rs +++ b/src/query/range_query/range_query_u64_fastfield.rs @@ -10,24 +10,22 @@ use columnar::{ StrColumn, }; use common::bounds::{BoundsRange, TransformBound}; -use common::BinarySerializable; use super::fast_field_range_doc_set::RangeDocSet; use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight}; -use crate::schema::{Field, Type, ValueBytes}; +use crate::schema::{Type, ValueBytes}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; /// `FastFieldRangeWeight` uses the 
fast field to execute range queries. #[derive(Clone, Debug)] pub struct FastFieldRangeWeight { bounds: BoundsRange, - field: Field, } impl FastFieldRangeWeight { /// Create a new FastFieldRangeWeight - pub(crate) fn new(field: Field, bounds: BoundsRange) -> Self { - Self { bounds, field } + pub(crate) fn new(bounds: BoundsRange) -> Self { + Self { bounds } } } @@ -46,12 +44,12 @@ impl Weight for FastFieldRangeWeight { if self.bounds.is_unbounded() { return Ok(Box::new(AllScorer::new(reader.max_doc()))); } - let field_type = reader.schema().get_field_entry(self.field).field_type(); let term = self .bounds .get_inner() .expect("At least one bound must be set"); + let field_type = reader.schema().get_field_entry(term.field()).field_type(); assert_eq!( term.typ(), field_type.value_type(), @@ -62,10 +60,6 @@ impl Weight for FastFieldRangeWeight { let field_name = term.get_full_path(reader.schema()); let get_value_bytes = |term: &Term| term.value().value_bytes_payload(); - let get_term_u64_internal_representation = |term: &Term| { - let bytes = term.value().value_bytes_payload(); - u64::from_be(BinarySerializable::deserialize(&mut &bytes[..]).unwrap()) - }; let term_value = term.value(); if field_type.is_json() { @@ -175,11 +169,35 @@ impl Weight for FastFieldRangeWeight { field_type ); - let bounds = self.bounds.map_bound(get_term_u64_internal_representation); + let bounds = self.bounds.map_bound_res(|term| { + let value = term.value(); + let val = if let Some(val) = value.as_u64() { + val + } else if let Some(val) = value.as_i64() { + val.to_u64() + } else if let Some(val) = value.as_f64() { + val.to_u64() + } else if let Some(val) = value.as_date() { + val.to_u64() + } else { + return Err(TantivyError::InvalidArgument(format!( + "Expected term with u64, i64, f64 or date, but got {:?}", + term + ))); + }; + Ok(val) + })?; let fast_field_reader = reader.fast_fields(); - let Some((column, _col_type)) = - fast_field_reader.u64_lenient_for_type(None, &field_name)? 
+ let Some((column, _col_type)) = fast_field_reader.u64_lenient_for_type( + Some(&[ + ColumnType::U64, + ColumnType::I64, + ColumnType::F64, + ColumnType::DateTime, + ]), + &field_name, + )? else { return Ok(Box::new(EmptyScorer)); }; @@ -212,7 +230,7 @@ fn search_on_json_numerical_field( boost: Score, ) -> crate::Result> { // Since we don't know which type was interpolated for the internal column whe - have to check for all types (only one exists) + have to check for all numeric types (only one exists) let allowed_column_types: Option<&[ColumnType]> = Some(&[ColumnType::F64, ColumnType::I64, ColumnType::U64]); let fast_field_reader = reader.fast_fields(); @@ -455,7 +473,8 @@ pub mod tests { use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight; use crate::query::{QueryParser, RangeQuery, Weight}; use crate::schema::{ - Field, NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, TEXT, + DateOptions, Field, NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, + TEXT, }; use crate::{Index, IndexWriter, Term, TERMINATED}; @@ -518,6 +537,113 @@ pub mod tests { Ok(()) } + #[test] + fn test_date_range_query() { + let mut schema_builder = Schema::builder(); + let options = DateOptions::default() + .set_precision(common::DateTimePrecision::Microseconds) + .set_fast(); + let date_field = schema_builder.add_date_field("date", options); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema.clone()); + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + // Add date documents at second and sub-millisecond precision. 
+ index_writer + .add_document(doc!(date_field => DateTime::from_utc( + OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap(), + ))) + .unwrap(); + index_writer + .add_document(doc!(date_field => DateTime::from_utc( + OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap(), + ))) + .unwrap(); + index_writer + .add_document(doc!(date_field => DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(), + ))) + .unwrap(); + index_writer.commit().unwrap(); + } + + // Date field + let dt1 = + DateTime::from_utc(OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap()); + let dt2 = + DateTime::from_utc(OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap()); + let dt3 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(), + ); + let dt4 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.002Z", &Rfc3339).unwrap(), + ); + + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let query_parser = QueryParser::for_index(&index, vec![date_field]); + let test_query = |query, num_hits| { + let query = query_parser.parse_query(query).unwrap(); + let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + assert_eq!(top_docs.len(), num_hits); + }; + + test_query( + "date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.001Z]", + 1, + ); + test_query( + "date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z}", + 1, + ); + test_query( + "date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 1, + ); + test_query( + "date:{2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 0, + ); + + let count = |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt3)), + Bound::Excluded(Term::from_field_date(date_field, dt4)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + 
Bound::Included(Term::from_field_date(date_field, dt3)), + Bound::Included(Term::from_field_date(date_field, dt4)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt1)), + Bound::Included(Term::from_field_date(date_field, dt2)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt1)), + Bound::Excluded(Term::from_field_date(date_field, dt2)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Excluded(Term::from_field_date(date_field, dt1)), + Bound::Excluded(Term::from_field_date(date_field, dt2)), + )), + 0 + ); + } + fn get_json_term(field: Field, path: &str, value: T) -> Term { let mut term = Term::from_field_json_path(field, path, true); term.append_type_and_fast_value(value); @@ -548,6 +674,10 @@ pub mod tests { "date": "2023-12-01T00:00:01Z" }); index_writer.add_document(doc!(json_field => doc)).unwrap(); + let doc = json!({ + "date": "2015-02-01T00:00:00.001Z" + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); index_writer.commit().unwrap(); } @@ -631,6 +761,13 @@ pub mod tests { )), 2 ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", 1000i64)), + Bound::Excluded(get_json_term(json_field, "id_i64", 1001i64)), + )), + 1 + ); // u64 on i64 assert_eq!( @@ -691,6 +828,32 @@ pub mod tests { 1 ); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let query_parser = QueryParser::for_index(&index, vec![json_field]); + let test_query = |query, num_hits| { + let query = query_parser.parse_query(query).unwrap(); + let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + assert_eq!(top_docs.len(), num_hits); + }; + + test_query( + "json.date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.001Z]", + 1, + ); + test_query( + "json.date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z}", + 1, + ); + test_query( + 
"json.date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 1, + ); + test_query( + "json.date:{2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 0, + ); + // Date field let dt1 = DateTime::from_utc(OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap()); @@ -718,6 +881,18 @@ pub mod tests { )), 0 ); + // Date precision test. We don't want to truncate the precision + let dt3 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(), + ); + let dt4 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.002Z", &Rfc3339).unwrap(), + ); + let query = RangeQuery::new( + Bound::Included(get_json_term(json_field, "date", dt3)), + Bound::Excluded(get_json_term(json_field, "date", dt4)), + ); + assert_eq!(count(query), 1); } #[derive(Clone, Debug)] @@ -796,13 +971,10 @@ pub mod tests { writer.add_document(doc!(field=>52_000u64)).unwrap(); writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); - let range_query = FastFieldRangeWeight::new( - field, - BoundsRange::new( - Bound::Included(Term::from_field_u64(field, 50_000)), - Bound::Included(Term::from_field_u64(field, 50_002)), - ), - ); + let range_query = FastFieldRangeWeight::new(BoundsRange::new( + Bound::Included(Term::from_field_u64(field, 50_000)), + Bound::Included(Term::from_field_u64(field, 50_002)), + )); let scorer = range_query .scorer(searcher.segment_reader(0), 1.0f32) .unwrap(); @@ -1158,13 +1330,10 @@ pub mod ip_range_tests { } writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); - let range_weight = FastFieldRangeWeight::new( - ips_field, - BoundsRange::new( - Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])), - Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2])), - ), - ); + let range_weight = FastFieldRangeWeight::new(BoundsRange::new( + Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])), + Bound::Included(Term::from_field_ip_addr(ips_field, 
ip_addrs[2])), + )); let count = crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap(); diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 2c1253737e..8b203f5b37 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -102,6 +102,9 @@ const ALL_TYPES: [Type; 10] = [ ]; impl Type { + /// Returns the numerical type if applicable. + /// It does not do any mapping, e.g. Date is None although it's also stored as I64 in the + /// column store pub fn numerical_type(&self) -> Option { match self { Type::I64 => Some(NumericalType::I64), diff --git a/src/schema/term.rs b/src/schema/term.rs index 58c81c785a..349b11fc4f 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -2,7 +2,7 @@ use std::hash::{Hash, Hasher}; use std::net::Ipv6Addr; use std::{fmt, str}; -use columnar::{MonotonicallyMappableToU128, MonotonicallyMappableToU64}; +use columnar::MonotonicallyMappableToU128; use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP_STR}; use common::JsonPathWriter; @@ -137,8 +137,20 @@ impl Term { Term::from_fast_value(field, &val) } - /// Builds a term given a field, and a `DateTime` value + /// Builds a term given a field, and a `DateTime` value. + /// + /// The contained value may not match the value, due to the truncation used + /// for indexed data [super::DATE_TIME_PRECISION_INDEXED]. + /// To create a term used for search use `from_field_date_for_search`. pub fn from_field_date(field: Field, val: DateTime) -> Term { + Term::from_fast_value(field, &val) + } + + /// Builds a term given a field, and a `DateTime` value to be used in searching the inverted + /// index. + /// It truncates the `DateTime` to the precision used in the index + /// ([super::DATE_TIME_PRECISION_INDEXED]). 
+ pub fn from_field_date_for_search(field: Field, val: DateTime) -> Term { Term::from_fast_value(field, &val.truncate(DATE_TIME_PRECISION_INDEXED)) } @@ -210,13 +222,7 @@ impl Term { /// It will not clear existing bytes. pub fn append_type_and_fast_value(&mut self, val: T) { self.0.push(T::to_type().to_code()); - let value = if T::to_type() == Type::Date { - DateTime::from_u64(val.to_u64()) - .truncate(DATE_TIME_PRECISION_INDEXED) - .to_u64() - } else { - val.to_u64() - }; + let value = val.to_u64(); self.0.extend(value.to_be_bytes().as_ref()); }