From 509adab79d4402c659dcb27593f9676e2c369d5a Mon Sep 17 00:00:00 2001 From: PSeitz Date: Mon, 12 Dec 2022 04:39:43 +0100 Subject: [PATCH 1/3] Bump version (#1715) * group workspace deps * update cargo.toml * revert tant version * chore: Release --- Cargo.toml | 13 +++++++------ bitpacker/Cargo.toml | 4 +++- common/Cargo.toml | 8 ++++++-- fastfield_codecs/Cargo.toml | 11 +++++++---- ownedbytes/Cargo.toml | 6 +++++- query-grammar/Cargo.toml | 2 +- 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0c340ceca1..77fede6348 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy" -version = "0.19.0-dev" +version = "0.19.0" authors = ["Paul Masurel "] license = "MIT" categories = ["database-implementations", "data-structures"] @@ -36,11 +36,6 @@ fs2 = { version = "0.4.3", optional = true } levenshtein_automata = "0.2.1" uuid = { version = "1.0.0", features = ["v4", "serde"] } crossbeam-channel = "0.5.4" -tantivy-query-grammar = { version="0.18.0", path="./query-grammar" } -tantivy-bitpacker = { version="0.2", path="./bitpacker" } -common = { version = "0.3", path = "./common/", package = "tantivy-common" } -fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false } -ownedbytes = { version="0.3", path="./ownedbytes" } stable_deref_trait = "1.2.0" rust-stemmers = "1.2.0" downcast-rs = "1.2.0" @@ -62,6 +57,12 @@ ciborium = { version = "0.2", optional = true} async-trait = "0.1.53" arc-swap = "1.5.0" +tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" } +tantivy-bitpacker = { version= "0.3", path="./bitpacker" } +common = { version= "0.4", path = "./common/", package = "tantivy-common" } +fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false } +ownedbytes = { version= "0.4", path="./ownedbytes" } + [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/bitpacker/Cargo.toml b/bitpacker/Cargo.toml index 
48c1b8a1c0..8a3db03fd2 100644 --- a/bitpacker/Cargo.toml +++ b/bitpacker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy-bitpacker" -version = "0.2.0" +version = "0.3.0" edition = "2021" authors = ["Paul Masurel "] license = "MIT" @@ -8,6 +8,8 @@ categories = [] description = """Tantivy-sub crate: bitpacking""" repository = "https://github.com/quickwit-oss/tantivy" keywords = [] +documentation = "https://docs.rs/tantivy-bitpacker/latest/tantivy_bitpacker" +homepage = "https://github.com/quickwit-oss/tantivy" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/common/Cargo.toml b/common/Cargo.toml index f7085d9c13..e579a9aab5 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -1,16 +1,20 @@ [package] name = "tantivy-common" -version = "0.3.0" +version = "0.4.0" authors = ["Paul Masurel ", "Pascal Seitz "] license = "MIT" edition = "2021" description = "common traits and utility functions used by multiple tantivy subcrates" +documentation = "https://docs.rs/tantivy_common/" +homepage = "https://github.com/quickwit-oss/tantivy" +repository = "https://github.com/quickwit-oss/tantivy" + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] byteorder = "1.4.3" -ownedbytes = { version="0.3", path="../ownedbytes" } +ownedbytes = { version= "0.4", path="../ownedbytes" } [dev-dependencies] proptest = "1.0.0" diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml index a53a8ceeb5..a56d0f9835 100644 --- a/fastfield_codecs/Cargo.toml +++ b/fastfield_codecs/Cargo.toml @@ -1,17 +1,20 @@ [package] name = "fastfield_codecs" -version = "0.2.0" +version = "0.3.0" authors = ["Pascal Seitz "] license = "MIT" edition = "2021" description = "Fast field codecs used by tantivy" +documentation = "https://docs.rs/fastfield_codecs/" +homepage = "https://github.com/quickwit-oss/tantivy" +repository = "https://github.com/quickwit-oss/tantivy" # See more 
keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -common = { version = "0.3", path = "../common/", package = "tantivy-common" } -tantivy-bitpacker = { version="0.2", path = "../bitpacker/" } -ownedbytes = { version = "0.3.0", path = "../ownedbytes" } +common = { version = "0.4", path = "../common/", package = "tantivy-common" } +tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" } +ownedbytes = { version = "0.4.0", path = "../ownedbytes" } prettytable-rs = {version="0.9.0", optional= true} rand = {version="0.8.3", optional= true} fastdivide = "0.4" diff --git a/ownedbytes/Cargo.toml b/ownedbytes/Cargo.toml index b06db9f930..4bd3206efd 100644 --- a/ownedbytes/Cargo.toml +++ b/ownedbytes/Cargo.toml @@ -1,10 +1,14 @@ [package] authors = ["Paul Masurel ", "Pascal Seitz "] name = "ownedbytes" -version = "0.3.0" +version = "0.4.0" edition = "2021" description = "Expose data as static slice" license = "MIT" +documentation = "https://docs.rs/ownedbytes/" +homepage = "https://github.com/quickwit-oss/tantivy" +repository = "https://github.com/quickwit-oss/tantivy" + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] diff --git a/query-grammar/Cargo.toml b/query-grammar/Cargo.toml index 02c967bbcf..91b1c4ad0f 100644 --- a/query-grammar/Cargo.toml +++ b/query-grammar/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy-query-grammar" -version = "0.18.0" +version = "0.19.0" authors = ["Paul Masurel "] license = "MIT" categories = ["database-implementations", "data-structures"] From 2c50b02eb3dcea0f8c6115c7d15cc73cfc5d5db7 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Mon, 12 Dec 2022 04:40:15 +0100 Subject: [PATCH 2/3] Fix max bucket limit in histogram (#1703) * Fix max bucket limit in histogram The max bucket limit in histogram was broken, since some code introduced temporary filtering of buckets, which then resulted into an incorrect increment on the bucket 
count. The provided solution covers more scenarios, but there are still some scenarios unhandled (See #1702). * Apply suggestions from code review Co-authored-by: Paul Masurel Co-authored-by: Paul Masurel --- src/aggregation/bucket/histogram/histogram.rs | 63 +++++++++++++++++-- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index fb59693b9c..8472628069 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -206,6 +206,7 @@ pub struct SegmentHistogramCollector { field_type: Type, interval: f64, offset: f64, + min_doc_count: u64, first_bucket_num: i64, bounds: HistogramBounds, } @@ -215,6 +216,30 @@ impl SegmentHistogramCollector { self, agg_with_accessor: &BucketAggregationWithAccessor, ) -> crate::Result { + // Compute the number of buckets to validate against max num buckets + // Note: We use min_doc_count here, but it's only a lower bound, since we are on the + // intermediate level and after merging the number of documents of a bucket could exceed + // `min_doc_count`. + { + let cut_off_buckets_front = self + .buckets + .iter() + .take_while(|bucket| bucket.doc_count <= self.min_doc_count) + .count(); + let cut_off_buckets_back = self.buckets[cut_off_buckets_front..] 
+ .iter() + .rev() + .take_while(|bucket| bucket.doc_count <= self.min_doc_count) + .count(); + let estimate_num_buckets = + self.buckets.len() - cut_off_buckets_front - cut_off_buckets_back; + + agg_with_accessor + .bucket_count + .add_count(estimate_num_buckets as u32); + agg_with_accessor.bucket_count.validate_bucket_count()?; + } + let mut buckets = Vec::with_capacity( self.buckets .iter() @@ -251,11 +276,6 @@ impl SegmentHistogramCollector { ); }; - agg_with_accessor - .bucket_count - .add_count(buckets.len() as u32); - agg_with_accessor.bucket_count.validate_bucket_count()?; - Ok(IntermediateBucketResult::Histogram { buckets }) } @@ -308,6 +328,7 @@ impl SegmentHistogramCollector { first_bucket_num, bounds, sub_aggregations, + min_doc_count: req.min_doc_count(), }) } @@ -1521,4 +1542,36 @@ mod tests { Ok(()) } + + #[test] + fn histogram_test_max_buckets_segments() -> crate::Result<()> { + let values = vec![0.0, 70000.0]; + + let index = get_test_index_from_values(true, &values)?; + + let agg_req: Aggregations = vec![( + "my_interval".to_string(), + Aggregation::Bucket(BucketAggregation { + bucket_agg: BucketAggregationType::Histogram(HistogramAggregation { + field: "score_f64".to_string(), + interval: 1.0, + ..Default::default() + }), + sub_aggregation: Default::default(), + }), + )] + .into_iter() + .collect(); + + let res = exec_request(agg_req, &index); + + assert_eq!( + res.unwrap_err().to_string(), + "An invalid argument was passed: 'Aborting aggregation because too many buckets were \ + created'" + .to_string() + ); + + Ok(()) + } } From 5d4535de83d214765c29501f0418f1ff38a607b1 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Mon, 12 Dec 2022 06:28:42 +0100 Subject: [PATCH 3/3] Changelog fix (#1717) --- CHANGELOG.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 78f93d66c2..6a767157b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,22 +2,21 @@ Tantivy 0.19 ================================ 
#### Bugfixes - Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz) -- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480 (@PSeitz @fulmicoton) +- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480) (@PSeitz @fulmicoton) #### Features/Improvements - Add support for `IN` in queryparser , e.g. `field: IN [val1 val2 val3]` [#1683](https://github.com/quickwit-oss/tantivy/pull/1683) (@trinity-1686a) - Skip score calculation, when no scoring is required [#1646](https://github.com/quickwit-oss/tantivy/pull/1646) (@PSeitz) - Limit fast fields to u32 (`get_val(u32)`) [#1644](https://github.com/quickwit-oss/tantivy/pull/1644) (@PSeitz) -- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396) - The `DateTime` type has been updated to hold timestamps with microseconds precision. - `DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing). (@evanxg852000) +- The `DateTime` type has been updated to hold timestamps with microseconds precision. + `DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing) [#1396](https://github.com/quickwit-oss/tantivy/pull/1396) (@evanxg852000) - Add IP address field type [#1553](https://github.com/quickwit-oss/tantivy/pull/1553) (@PSeitz) - Add boolean field type [#1382](https://github.com/quickwit-oss/tantivy/pull/1382) (@boraarslan) - Remove Searcher pool and make `Searcher` cloneable. 
(@PSeitz) -- Validate settings on create [#1570](https://github.com/quickwit-oss/tantivy/pull/1570 (@PSeitz) +- Validate settings on create [#1570](https://github.com/quickwit-oss/tantivy/pull/1570) (@PSeitz) - Detect and apply gcd on fastfield codecs [#1418](https://github.com/quickwit-oss/tantivy/pull/1418) (@PSeitz) - Doc store - - use separate thread to compress block store [#1389](https://github.com/quickwit-oss/tantivy/pull/1389) [#1510](https://github.com/quickwit-oss/tantivy/pull/1510 (@PSeitz @fulmicoton) + - use separate thread to compress block store [#1389](https://github.com/quickwit-oss/tantivy/pull/1389) [#1510](https://github.com/quickwit-oss/tantivy/pull/1510) (@PSeitz @fulmicoton) - Expose doc store cache size [#1403](https://github.com/quickwit-oss/tantivy/pull/1403) (@PSeitz) - Enable compression levels for doc store [#1378](https://github.com/quickwit-oss/tantivy/pull/1378) (@PSeitz) - Make block size configurable [#1374](https://github.com/quickwit-oss/tantivy/pull/1374) (@kryesh)