From cd956e6b2166139655cae172c9e786fb320d29c3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 12 Mar 2024 16:36:29 +0800 Subject: [PATCH 1/2] expose buffer of collect_block --- src/collector/mod.rs | 4 ++++ src/docset.rs | 9 ++++++--- src/lib.rs | 2 +- src/query/all_query.rs | 20 ++++++++++---------- src/query/boolean_query/boolean_weight.rs | 4 ++-- src/query/boost_query.rs | 4 ++-- src/query/const_score_query.rs | 4 ++-- src/query/term_query/term_weight.rs | 4 ++-- src/query/vec_docset.rs | 16 ++++++++-------- src/query/weight.rs | 6 +++--- 10 files changed, 40 insertions(+), 33 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index de6c69f280..b78e020722 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -274,6 +274,10 @@ pub trait SegmentCollector: 'static { fn collect(&mut self, doc: DocId, score: Score); /// The query pushes the scored document to the collector via this method. + /// This method is used when the collector does not require scoring. + /// + /// See [`COLLECT_BLOCK_BUFFER_LEN`](crate::COLLECT_BLOCK_BUFFER_LEN) for the + /// buffer size passed to the collector. fn collect_block(&mut self, docs: &[DocId]) { for doc in docs { self.collect(*doc, 0.0); diff --git a/src/docset.rs b/src/docset.rs index 7f0b10c703..d04024d452 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -9,7 +9,10 @@ use crate::DocId; /// to compare `[u32; 4]`. pub const TERMINATED: DocId = i32::MAX as u32; -pub const BUFFER_LEN: usize = 64; +/// The collect_block method on `SegmentCollector` uses a buffer of this size. +/// Passed results to `collect_block` will not exceed this size and will be +/// exactly this size as long as we can fill the buffer. +pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64; /// Represents an iterable set of sorted doc ids. pub trait DocSet: Send { @@ -61,7 +64,7 @@ pub trait DocSet: Send { /// This method is only here for specific high-performance /// use case where batching. The normal way to /// go through the `DocId`'s is to call `.advance()`. - fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize { + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { if self.doc() == TERMINATED { return 0; } @@ -151,7 +154,7 @@ impl DocSet for Box { unboxed.seek(target) } - fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize { + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.fill_buffer(buffer) } diff --git a/src/lib.rs b/src/lib.rs index 1ac7ad21f6..11a0a2ae43 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -213,7 +213,7 @@ pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, HasLen}; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; -pub use self::docset::{DocSet, TERMINATED}; +pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; #[deprecated( since = "0.22.0", note = "Will be removed in tantivy 0.23. Use export from snippet module instead" diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 2d0ffa0ce0..149041b043 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -1,4 +1,4 @@ -use crate::docset::{DocSet, BUFFER_LEN, TERMINATED}; +use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; use crate::index::SegmentReader; use crate::query::boost_query::BoostScorer; use crate::query::explanation::does_not_match; @@ -54,7 +54,7 @@ impl DocSet for AllScorer { self.doc } - fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize { + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { if self.doc() == TERMINATED { return 0; } @@ -96,7 +96,7 @@ impl Scorer for AllScorer { #[cfg(test)] mod tests { use super::AllQuery; - use crate::docset::{DocSet, BUFFER_LEN, TERMINATED}; + use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; use crate::query::{AllScorer, EnableScoring, Query}; use crate::schema::{Schema, TEXT}; use crate::{Index, IndexWriter}; @@ -162,16 +162,16 @@ mod tests { pub fn test_fill_buffer() { let mut postings = AllScorer { doc: 0u32, - max_doc: BUFFER_LEN as u32 * 2 + 9, + max_doc: COLLECT_BLOCK_BUFFER_LEN as u32 * 2 + 9, }; - let mut buffer = [0u32; BUFFER_LEN]; - assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN); - for i in 0u32..BUFFER_LEN as u32 { + let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; + assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN); + for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 { assert_eq!(buffer[i as usize], i); } - assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN); - for i in 0u32..BUFFER_LEN as u32 { - assert_eq!(buffer[i as usize], i + BUFFER_LEN as u32); + assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN); + for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 { + assert_eq!(buffer[i as usize], i + COLLECT_BLOCK_BUFFER_LEN as u32); } assert_eq!(postings.fill_buffer(&mut buffer), 9); } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 094e0f5c03..ece6217d22 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use crate::docset::BUFFER_LEN; +use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::index::SegmentReader; use crate::postings::FreqReadingOption; use crate::query::explanation::does_not_match; @@ -228,7 +228,7 @@ impl Weight for BooleanWeight crate::Result<()> { let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; - let mut buffer = [0u32; BUFFER_LEN]; + let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; match scorer { SpecializedScorer::TermUnion(term_scorers) => { diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index e7c25114f6..4d2352d4d6 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -1,6 +1,6 @@ use std::fmt; -use crate::docset::BUFFER_LEN; +use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::fastfield::AliveBitSet; use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, SegmentReader, Term}; @@ -105,7 +105,7 @@ impl DocSet for BoostScorer { self.underlying.seek(target) } - fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize { + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { self.underlying.fill_buffer(buffer) } diff --git a/src/query/const_score_query.rs b/src/query/const_score_query.rs index 80f81fdfcd..8f27b82857 100644 --- a/src/query/const_score_query.rs +++ b/src/query/const_score_query.rs @@ -1,6 +1,6 @@ use std::fmt; -use crate::docset::BUFFER_LEN; +use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; @@ -119,7 +119,7 @@ impl DocSet for ConstScorer { self.docset.seek(target) } - fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize { + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { self.docset.fill_buffer(buffer) } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 61b9b10b59..a70c8ce8fa 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -1,5 +1,5 @@ use super::term_scorer::TermScorer; -use crate::docset::{DocSet, BUFFER_LEN}; +use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN}; use crate::fieldnorm::FieldNormReader; use crate::index::SegmentReader; use crate::postings::SegmentPostings; @@ -64,7 +64,7 @@ impl Weight for TermWeight { callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { let mut scorer = self.specialized_scorer(reader, 1.0)?; - let mut buffer = [0u32; BUFFER_LEN]; + let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; for_each_docset_buffered(&mut scorer, &mut buffer, callback); Ok(()) } diff --git a/src/query/vec_docset.rs b/src/query/vec_docset.rs index e0a7b9f6b0..f4e1b505f6 100644 --- a/src/query/vec_docset.rs +++ b/src/query/vec_docset.rs @@ -53,7 +53,7 @@ impl HasLen for VecDocSet { pub mod tests { use super::*; - use crate::docset::{DocSet, BUFFER_LEN}; + use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN}; use crate::DocId; #[test] @@ -72,16 +72,16 @@ pub mod tests { #[test] pub fn test_fill_buffer() { - let doc_ids: Vec = (1u32..=(BUFFER_LEN as u32 * 2 + 9)).collect(); + let doc_ids: Vec = (1u32..=(COLLECT_BLOCK_BUFFER_LEN as u32 * 2 + 9)).collect(); let mut postings = VecDocSet::from(doc_ids); - let mut buffer = [0u32; BUFFER_LEN]; - assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN); - for i in 0u32..BUFFER_LEN as u32 { + let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; + assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN); + for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 { assert_eq!(buffer[i as usize], i + 1); } - assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN); - for i in 0u32..BUFFER_LEN as u32 { - assert_eq!(buffer[i as usize], i + 1 + BUFFER_LEN as u32); + assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN); + for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 { + assert_eq!(buffer[i as usize], i + 1 + COLLECT_BLOCK_BUFFER_LEN as u32); } assert_eq!(postings.fill_buffer(&mut buffer), 9); } diff --git a/src/query/weight.rs b/src/query/weight.rs index 24e49bb5b9..23ff55c046 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,5 +1,5 @@ use super::Scorer; -use crate::docset::BUFFER_LEN; +use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::index::SegmentReader; use crate::query::Explanation; use crate::{DocId, DocSet, Score, TERMINATED}; @@ -22,7 +22,7 @@ pub(crate) fn for_each_scorer( #[inline] pub(crate) fn for_each_docset_buffered( docset: &mut T, - buffer: &mut [DocId; BUFFER_LEN], + buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN], mut callback: impl FnMut(&[DocId]), ) { loop { @@ -105,7 +105,7 @@ pub trait Weight: Send + Sync + 'static { ) -> crate::Result<()> { let mut docset = self.scorer(reader, 1.0)?; - let mut buffer = [0u32; BUFFER_LEN]; + let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; for_each_docset_buffered(&mut docset, &mut buffer, callback); Ok(()) } From 73f79c562b0f58de28cf7557940b70dc163bfdfc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 12 Mar 2024 16:41:45 +0800 Subject: [PATCH 2/2] flip shard_size segment_size --- src/aggregation/bucket/term_agg.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index b5ada2dc46..fe1197274c 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -105,9 +105,9 @@ pub struct TermsAggregation { /// /// Defaults to 10 * size. #[serde(skip_serializing_if = "Option::is_none", default)] - #[serde(alias = "segment_size")] + #[serde(alias = "shard_size")] #[serde(alias = "split_size")] - pub shard_size: Option, + pub segment_size: Option, /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will /// include doc_count_error_upper_bound, which is an upper bound to the error on the @@ -196,7 +196,7 @@ impl TermsAggregationInternal { pub(crate) fn from_req(req: &TermsAggregation) -> Self { let size = req.size.unwrap_or(10); - let mut segment_size = req.shard_size.unwrap_or(size * 10); + let mut segment_size = req.segment_size.unwrap_or(size * 10); let order = req.order.clone().unwrap_or_default(); segment_size = segment_size.max(size);