From b994dcc416f4b5efd14aa3db3b866c1505c39592 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 22 Apr 2024 14:29:16 +0800 Subject: [PATCH] add json path constructor to term --- src/core/json_utils.rs | 38 ++++++-------------------- src/core/tests.rs | 5 ++-- src/indexer/segment_writer.rs | 37 ++++++++++++------------- src/query/query_parser/query_parser.rs | 14 ++-------- src/schema/term.rs | 28 +++++++++++++++++-- 5 files changed, 57 insertions(+), 65 deletions(-) diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index d7ac29ad7d..1f459ff3ae 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -4,7 +4,7 @@ use rustc_hash::FxHashMap; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; -use crate::schema::{Field, Type}; +use crate::schema::Type; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; @@ -349,44 +349,24 @@ pub(crate) fn encode_column_name( path.into() } -pub fn term_from_json_paths<'a>( - json_field: Field, - paths: impl Iterator<Item = &'a str>, - expand_dots_enabled: bool, -) -> Term { - let mut json_path = JsonPathWriter::with_expand_dots(expand_dots_enabled); - for path in paths { - json_path.push(path); - } - json_path.set_end(); - let mut term = Term::with_type_and_field(Type::Json, json_field); - - term.append_bytes(json_path.as_str().as_bytes()); - term -} - #[cfg(test)] mod tests { use super::split_json_path; - use crate::json_utils::term_from_json_paths; use crate::schema::Field; + use crate::Term; #[test] fn test_json_writer() { let field = Field::from_field_id(1); - let mut term = term_from_json_paths(field, ["attributes", "color"].into_iter(), false); + let mut term = Term::from_field_json_path(field, "attributes.color", false); term.append_type_and_str("red"); assert_eq!( format!("{:?}", term), "Term(field=1, type=Json, path=attributes.color, 
type=Str, \"red\")" ); - let mut term = term_from_json_paths( - field, - ["attributes", "dimensions", "width"].into_iter(), - false, - ); + let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false); term.append_type_and_fast_value(400i64); assert_eq!( format!("{:?}", term), @@ -397,7 +377,7 @@ mod tests { #[test] fn test_string_term() { let field = Field::from_field_id(1); - let mut term = term_from_json_paths(field, ["color"].into_iter(), false); + let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_str("red"); assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred") @@ -406,7 +386,7 @@ mod tests { #[test] fn test_i64_term() { let field = Field::from_field_id(1); - let mut term = term_from_json_paths(field, ["color"].into_iter(), false); + let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_fast_value(-4i64); assert_eq!( @@ -418,7 +398,7 @@ mod tests { #[test] fn test_u64_term() { let field = Field::from_field_id(1); - let mut term = term_from_json_paths(field, ["color"].into_iter(), false); + let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_fast_value(4u64); assert_eq!( @@ -430,7 +410,7 @@ mod tests { #[test] fn test_f64_term() { let field = Field::from_field_id(1); - let mut term = term_from_json_paths(field, ["color"].into_iter(), false); + let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_fast_value(4.0f64); assert_eq!( term.serialized_term(), @@ -441,7 +421,7 @@ mod tests { #[test] fn test_bool_term() { let field = Field::from_field_id(1); - let mut term = term_from_json_paths(field, ["color"].into_iter(), false); + let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_fast_value(true); assert_eq!( term.serialized_term(), diff --git a/src/core/tests.rs b/src/core/tests.rs index 210b359da8..2efa77b220 100644 --- a/src/core/tests.rs +++ 
b/src/core/tests.rs @@ -1,7 +1,6 @@ use crate::collector::Count; use crate::directory::{RamDirectory, WatchCallback}; use crate::indexer::{LogMergePolicy, NoMergePolicy}; -use crate::json_utils::term_from_json_paths; use crate::query::TermQuery; use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT}; use crate::tokenizer::TokenizerManager; @@ -417,7 +416,7 @@ fn test_non_text_json_term_freq() { let segment_reader = searcher.segment_reader(0u32); let inv_idx = segment_reader.inverted_index(field).unwrap(); - let mut term = term_from_json_paths(field, ["tenant_id"].iter().cloned(), false); + let mut term = Term::from_field_json_path(field, "tenant_id", false); term.append_type_and_fast_value(75u64); let postings = inv_idx @@ -451,7 +450,7 @@ fn test_non_text_json_term_freq_bitpacked() { let segment_reader = searcher.segment_reader(0u32); let inv_idx = segment_reader.inverted_index(field).unwrap(); - let mut term = term_from_json_paths(field, ["tenant_id"].iter().cloned(), false); + let mut term = Term::from_field_json_path(field, "tenant_id", false); term.append_type_and_fast_value(75u64); let mut postings = inv_idx diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 384a939e6d..6f30935d10 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -498,7 +498,6 @@ mod tests { use crate::collector::{Count, TopDocs}; use crate::directory::RamDirectory; use crate::fastfield::FastValue; - use crate::json_utils::term_from_json_paths; use crate::postings::TermInfo; use crate::query::{PhraseQuery, QueryParser}; use crate::schema::document::Value; @@ -647,9 +646,8 @@ mod tests { let mut term_stream = term_dict.stream().unwrap(); - let term_from_path = |paths: &[&str]| -> Term { - term_from_json_paths(json_field, paths.iter().cloned(), false) - }; + let term_from_path = + |path: &str| -> Term { Term::from_field_json_path(json_field, path, false) }; fn set_fast_val<T: FastValue>(val: T, mut term: Term) -> Term { 
term.append_type_and_fast_value(val); @@ -660,15 +658,14 @@ mod tests { term } - let term = term_from_path(&["bool"]); + let term = term_from_path("bool"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), set_fast_val(true, term).serialized_value_bytes() ); - let term = term_from_path(&["complexobject", "field.with.dot"]); - + let term = term_from_path("complexobject.field\\.with\\.dot"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), @@ -676,7 +673,7 @@ mod tests { ); // Date - let term = term_from_path(&["date"]); + let term = term_from_path("date"); assert!(term_stream.advance()); assert_eq!( @@ -691,7 +688,7 @@ mod tests { ); // Float - let term = term_from_path(&["float"]); + let term = term_from_path("float"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), @@ -699,21 +696,21 @@ mod tests { ); // Number In Array - let term = term_from_path(&["my_arr"]); + let term = term_from_path("my_arr"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), set_fast_val(2i64, term).serialized_value_bytes() ); - let term = term_from_path(&["my_arr"]); + let term = term_from_path("my_arr"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), set_fast_val(3i64, term).serialized_value_bytes() ); - let term = term_from_path(&["my_arr"]); + let term = term_from_path("my_arr"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), @@ -721,13 +718,13 @@ mod tests { ); // El in Array - let term = term_from_path(&["my_arr", "my_key"]); + let term = term_from_path("my_arr.my_key"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), set_str("tokens", term).serialized_value_bytes() ); - let term = term_from_path(&["my_arr", "my_key"]); + let term = term_from_path("my_arr.my_key"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), @@ -735,21 +732,21 @@ mod tests { ); // Signed - let term = term_from_path(&["signed"]); + let term = term_from_path("signed"); assert!(term_stream.advance()); 
assert_eq!( term_stream.key(), set_fast_val(-2i64, term).serialized_value_bytes() ); - let term = term_from_path(&["toto"]); + let term = term_from_path("toto"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), set_str("titi", term).serialized_value_bytes() ); // Unsigned - let term = term_from_path(&["unsigned"]); + let term = term_from_path("unsigned"); assert!(term_stream.advance()); assert_eq!( term_stream.key(), @@ -776,7 +773,7 @@ mod tests { let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0u32); let inv_index = segment_reader.inverted_index(json_field).unwrap(); - let mut term = term_from_json_paths(json_field, ["mykey"].into_iter(), false); + let mut term = Term::from_field_json_path(json_field, "mykey", false); term.append_type_and_str("token"); let term_info = inv_index.get_term_info(&term).unwrap().unwrap(); assert_eq!( @@ -815,7 +812,7 @@ mod tests { let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0u32); let inv_index = segment_reader.inverted_index(json_field).unwrap(); - let mut term = term_from_json_paths(json_field, ["mykey"].into_iter(), false); + let mut term = Term::from_field_json_path(json_field, "mykey", false); term.append_type_and_str("two tokens"); let term_info = inv_index.get_term_info(&term).unwrap().unwrap(); assert_eq!( @@ -856,7 +853,7 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); - let term = term_from_json_paths(json_field, ["mykey", "field"].into_iter(), false); + let term = Term::from_field_json_path(json_field, "mykey.field", false); let mut hello_term = term.clone(); hello_term.append_type_and_str("hello"); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index aedd0c433a..d668e8eaf9 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -11,9 +11,7 @@ use rustc_hash::FxHashMap; use super::logical_ast::*; use 
crate::index::Index; -use crate::json_utils::{ - convert_to_fast_value_and_append_to_json_term, split_json_path, term_from_json_paths, -}; +use crate::json_utils::convert_to_fast_value_and_append_to_json_term; use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery}; use crate::query::{ AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery, @@ -966,14 +964,8 @@ fn generate_literals_for_json_object( let index_record_option = text_options.index_option(); let mut logical_literals = Vec::new(); - let paths = split_json_path(json_path); - let get_term_with_path = || { - term_from_json_paths( - field, - paths.iter().map(|el| el.as_str()), - json_options.is_expand_dots_enabled(), - ) - }; + let get_term_with_path = + || Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled()); // Try to convert the phrase to a fast value if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase) diff --git a/src/schema/term.rs b/src/schema/term.rs index 3ac5d0ac4b..b4d0d6288b 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -4,10 +4,12 @@ use std::{fmt, str}; use columnar::{MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP_STR}; +use common::JsonPathWriter; use super::date_time_options::DATE_TIME_PRECISION_INDEXED; use super::Field; use crate::fastfield::FastValue; +use crate::json_utils::split_json_path; use crate::schema::{Facet, Type}; use crate::DateTime; @@ -33,6 +35,28 @@ impl Term { Term(data) } + /// Creates a term from a json path. + /// + /// The json path can address a nested value in a JSON object. + /// e.g. `{"k8s": {"node": {"id": 5}}}` can be addressed via `k8s.node.id`. + /// + /// In case there are dots in the field name, and the `expand_dots_enabled` parameter is not + /// set they need to be escaped with a backslash. + /// e.g. 
`{"k8s.node": {"id": 5}}` can be addressed via `k8s\.node.id`. + pub fn from_field_json_path(field: Field, json_path: &str, expand_dots_enabled: bool) -> Term { + let paths = split_json_path(json_path); + let mut json_path = JsonPathWriter::with_expand_dots(expand_dots_enabled); + for path in paths { + json_path.push(&path); + } + json_path.set_end(); + let mut term = Term::with_type_and_field(Type::Json, field); + + term.append_bytes(json_path.as_str().as_bytes()); + + term + } + pub(crate) fn with_type_and_field(typ: Type, field: Field) -> Term { let mut term = Self::with_capacity(8); term.set_field_and_type(field, typ); @@ -165,7 +189,7 @@ impl Term { /// This is used in JSON type to append a fast value after the path. /// /// It will not clear existing bytes. - pub(crate) fn append_type_and_fast_value<T: FastValue>(&mut self, val: T) { + pub fn append_type_and_fast_value<T: FastValue>(&mut self, val: T) { self.0.push(T::to_type().to_code()); let value = if T::to_type() == Type::Date { DateTime::from_u64(val.to_u64()) @@ -181,7 +205,7 @@ impl Term { /// This is used in JSON type to append a str after the path. /// /// It will not clear existing bytes. - pub(crate) fn append_type_and_str(&mut self, val: &str) { + pub fn append_type_and_str(&mut self, val: &str) { self.0.push(Type::Str.to_code()); self.0.extend(val.as_bytes().as_ref()); }