Skip to content

Commit

Permalink
Merge branch 'main' into stacker-sstable-refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
fulmicoton committed Dec 12, 2022
2 parents c7f6bdc + 5d4535d commit dddebc2
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 30 deletions.
11 changes: 5 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,21 @@ Tantivy 0.19
================================
#### Bugfixes
- Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480 (@PSeitz @fulmicoton)
- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480) (@PSeitz @fulmicoton)

#### Features/Improvements
- Add support for `IN` in queryparser, e.g. `field: IN [val1 val2 val3]` [#1683](https://github.com/quickwit-oss/tantivy/pull/1683) (@trinity-1686a)
- Skip score calculation, when no scoring is required [#1646](https://github.com/quickwit-oss/tantivy/pull/1646) (@PSeitz)
- Limit fast fields to u32 (`get_val(u32)`) [#1644](https://github.com/quickwit-oss/tantivy/pull/1644) (@PSeitz)
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
The `DateTime` type has been updated to hold timestamps with microseconds precision.
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing). (@evanxg852000)
- The `DateTime` type has been updated to hold timestamps with microseconds precision.
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used as a hint for fast value compression. Seconds precision is used everywhere else (i.e. terms, indexing) [#1396](https://github.com/quickwit-oss/tantivy/pull/1396) (@evanxg852000)
- Add IP address field type [#1553](https://github.com/quickwit-oss/tantivy/pull/1553) (@PSeitz)
- Add boolean field type [#1382](https://github.com/quickwit-oss/tantivy/pull/1382) (@boraarslan)
- Remove Searcher pool and make `Searcher` cloneable. (@PSeitz)
- Validate settings on create [#1570](https://github.com/quickwit-oss/tantivy/pull/1570 (@PSeitz)
- Validate settings on create [#1570](https://github.com/quickwit-oss/tantivy/pull/1570) (@PSeitz)
- Detect and apply gcd on fastfield codecs [#1418](https://github.com/quickwit-oss/tantivy/pull/1418) (@PSeitz)
- Doc store
- use separate thread to compress block store [#1389](https://github.com/quickwit-oss/tantivy/pull/1389) [#1510](https://github.com/quickwit-oss/tantivy/pull/1510 (@PSeitz @fulmicoton)
- use separate thread to compress block store [#1389](https://github.com/quickwit-oss/tantivy/pull/1389) [#1510](https://github.com/quickwit-oss/tantivy/pull/1510) (@PSeitz @fulmicoton)
- Expose doc store cache size [#1403](https://github.com/quickwit-oss/tantivy/pull/1403) (@PSeitz)
- Enable compression levels for doc store [#1378](https://github.com/quickwit-oss/tantivy/pull/1378) (@PSeitz)
- Make block size configurable [#1374](https://github.com/quickwit-oss/tantivy/pull/1374) (@kryesh)
Expand Down
18 changes: 10 additions & 8 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.0-dev"
version = "0.19.0"
authors = ["Paul Masurel <[email protected]>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
Expand Down Expand Up @@ -36,13 +36,6 @@ fs2 = { version = "0.4.3", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.3", path="./ownedbytes" }
stable_deref_trait = "1.2.0"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
Expand All @@ -63,6 +56,15 @@ measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"

tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.4", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
ownedbytes = { version= "0.4", path="./ownedbytes" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }


[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

Expand Down
4 changes: 3 additions & 1 deletion bitpacker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
[package]
name = "tantivy-bitpacker"
version = "0.2.0"
version = "0.3.0"
edition = "2021"
authors = ["Paul Masurel <[email protected]>"]
license = "MIT"
categories = []
description = """Tantivy-sub crate: bitpacking"""
repository = "https://github.com/quickwit-oss/tantivy"
keywords = []
documentation = "https://docs.rs/tantivy-bitpacker/latest/tantivy_bitpacker"
homepage = "https://github.com/quickwit-oss/tantivy"


# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
8 changes: 6 additions & 2 deletions common/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
[package]
name = "tantivy-common"
version = "0.3.0"
version = "0.4.0"
authors = ["Paul Masurel <[email protected]>", "Pascal Seitz <[email protected]>"]
license = "MIT"
edition = "2021"
description = "common traits and utility functions used by multiple tantivy subcrates"
documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"


# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.3", path="../ownedbytes" }
ownedbytes = { version= "0.4", path="../ownedbytes" }

[dev-dependencies]
proptest = "1.0.0"
Expand Down
11 changes: 7 additions & 4 deletions fastfield_codecs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
[package]
name = "fastfield_codecs"
version = "0.2.0"
version = "0.3.0"
authors = ["Pascal Seitz <[email protected]>"]
license = "MIT"
edition = "2021"
description = "Fast field codecs used by tantivy"
documentation = "https://docs.rs/fastfield_codecs/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
common = { version = "0.4", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
ownedbytes = { version = "0.4.0", path = "../ownedbytes" }
prettytable-rs = {version="0.9.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"
Expand Down
6 changes: 5 additions & 1 deletion ownedbytes/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
[package]
authors = ["Paul Masurel <[email protected]>", "Pascal Seitz <[email protected]>"]
name = "ownedbytes"
version = "0.3.0"
version = "0.4.0"
edition = "2021"
description = "Expose data as static slice"
license = "MIT"
documentation = "https://docs.rs/ownedbytes/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion query-grammar/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.18.0"
version = "0.19.0"
authors = ["Paul Masurel <[email protected]>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
Expand Down
63 changes: 58 additions & 5 deletions src/aggregation/bucket/histogram/histogram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ pub struct SegmentHistogramCollector {
field_type: Type,
interval: f64,
offset: f64,
min_doc_count: u64,
first_bucket_num: i64,
bounds: HistogramBounds,
}
Expand All @@ -215,6 +216,30 @@ impl SegmentHistogramCollector {
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
// Compute the number of buckets to validate against max num buckets
// Note: We use min_doc_count here, but it is only a lower bound, since we are on the
// intermediate level and after merging the number of documents of a bucket could exceed
// `min_doc_count`.
{
let cut_off_buckets_front = self
.buckets
.iter()
.take_while(|bucket| bucket.doc_count <= self.min_doc_count)
.count();
let cut_off_buckets_back = self.buckets[cut_off_buckets_front..]
.iter()
.rev()
.take_while(|bucket| bucket.doc_count <= self.min_doc_count)
.count();
let estimate_num_buckets =
self.buckets.len() - cut_off_buckets_front - cut_off_buckets_back;

agg_with_accessor
.bucket_count
.add_count(estimate_num_buckets as u32);
agg_with_accessor.bucket_count.validate_bucket_count()?;
}

let mut buckets = Vec::with_capacity(
self.buckets
.iter()
Expand Down Expand Up @@ -251,11 +276,6 @@ impl SegmentHistogramCollector {
);
};

agg_with_accessor
.bucket_count
.add_count(buckets.len() as u32);
agg_with_accessor.bucket_count.validate_bucket_count()?;

Ok(IntermediateBucketResult::Histogram { buckets })
}

Expand Down Expand Up @@ -308,6 +328,7 @@ impl SegmentHistogramCollector {
first_bucket_num,
bounds,
sub_aggregations,
min_doc_count: req.min_doc_count(),
})
}

Expand Down Expand Up @@ -1521,4 +1542,36 @@ mod tests {

Ok(())
}

/// Runs a histogram aggregation with an interval of `1.0` over an index built from the
/// values `0.0` and `70000.0`, and expects the request to fail with the
/// "too many buckets" error rather than return a result.
#[test]
fn histogram_test_max_buckets_segments() -> crate::Result<()> {
    let scores = vec![0.0, 70000.0];
    let index = get_test_index_from_values(true, &scores)?;

    // A single histogram bucket aggregation on `score_f64` without sub-aggregations.
    let histogram = HistogramAggregation {
        field: "score_f64".to_string(),
        interval: 1.0,
        ..Default::default()
    };
    let bucket_aggregation = Aggregation::Bucket(BucketAggregation {
        bucket_agg: BucketAggregationType::Histogram(histogram),
        sub_aggregation: Default::default(),
    });
    let agg_req: Aggregations =
        std::iter::once(("my_interval".to_string(), bucket_aggregation)).collect();

    // The request must be rejected; `unwrap_err` panics if it unexpectedly succeeds.
    let error = exec_request(agg_req, &index).unwrap_err();
    assert_eq!(
        error.to_string(),
        "An invalid argument was passed: 'Aborting aggregation because too many buckets were \
         created'"
    );

    Ok(())
}
}
2 changes: 1 addition & 1 deletion sstable/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
common = {path="../common", package="tantivy-common"}
common = { version = "0.4", path="../common", package="tantivy-common"}
ciborium = "0.2"
byteorder = "1"
serde = "1"
2 changes: 1 addition & 1 deletion stacker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ edition = "2021"
[dependencies]
murmurhash32 = "0.2"
byteorder = "1"
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
common = { version = "0.4", path = "../common/", package = "tantivy-common" }

0 comments on commit dddebc2

Please sign in to comment.