Support fetching multiple documents #2252
The diff (against `main`) adds batched document fetching to `Searcher` and to the doc store's `StoreReader`:
```diff
@@ -1,7 +1,10 @@
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::{fmt, io};
 
+#[cfg(feature = "quickwit")]
+use futures_util::StreamExt;
+
 use crate::collector::Collector;
 use crate::core::{Executor, SegmentReader};
 use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
```
```diff
@@ -89,6 +92,23 @@ impl Searcher {
         store_reader.get(doc_address.doc_id)
     }
 
+    /// Fetches documents from tantivy's store given a list of [`DocAddress`].
+    ///
+    /// This method is more efficient than calling [`doc`](Self::doc) multiple times, as it
+    /// batches overlapping requests to segments and blocks.
+    pub fn docs<D: DocumentDeserialize>(
+        &self,
+        doc_addresses: &BTreeSet<DocAddress>,
+    ) -> crate::Result<HashMap<DocAddress, D>> {
+        // This implementation assumes that the `BlockCache` inside the `StoreReader` has
+        // non-zero capacity. This, combined with the fact that iteration of `doc_addresses`
+        // is ordered, allows for blocks to be re-used.
+        doc_addresses
+            .iter()
+            .map(|doc_address| Ok((*doc_address, self.doc(*doc_address)?)))
+            .collect()
+    }
+
     /// The cache stats for the underlying store reader.
     ///
     /// Aggregates the sum for each segment store reader.
```
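For orientation (not part of the diff), a minimal usage sketch of the new `docs` method, assuming a `searcher` and `query` are already in scope; whether `TantivyDocument` is the right concrete document type depends on the tantivy version in use:

```rust
use std::collections::BTreeSet;

use tantivy::collector::TopDocs;
use tantivy::{DocAddress, TantivyDocument};

// Collect the hits into a BTreeSet so they are visited in (segment_ord, doc_id)
// order; consecutive fetches from the same store block then hit the BlockCache.
let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
let addresses: BTreeSet<DocAddress> = hits.into_iter().map(|(_score, addr)| addr).collect();

// One batched call instead of up to ten individual `searcher.doc(addr)` calls.
let docs = searcher.docs::<TantivyDocument>(&addresses)?;
```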
```diff
@@ -112,6 +132,72 @@ impl Searcher {
         store_reader.get_async(doc_address.doc_id).await
     }
 
+    /// Fetches a set of documents in an asynchronous manner.
+    ///
+    /// This method is more efficient than calling [`doc_async`](Self::doc_async) multiple times,
+    /// as it batches overlapping requests to segments and blocks.
+    #[cfg(feature = "quickwit")]
+    pub async fn docs_async<D: DocumentDeserialize>(
+        &self,
+        doc_addresses: &BTreeSet<DocAddress>,
+    ) -> crate::Result<HashMap<DocAddress, D>> {
+        let mut segment_ord_set = Vec::<&DocAddress>::new();
+        let mut futures = futures_util::stream::FuturesUnordered::new();
+
+        // Helper function that creates a future to fetch the docs for a set of
+        // `DocAddress`es that all have the same segment ordinal.
+        let get_docs_for_segment_ord_set = |doc_addrs: &mut Vec<&DocAddress>| {
+            let segment_ord = doc_addrs.first().unwrap().segment_ord;
+            let store_reader = &self.inner.store_readers[segment_ord as usize];
+
+            let doc_ids = std::mem::take(doc_addrs)
+                .into_iter()
+                .map(|doc_address| doc_address.doc_id)
+                .collect();
+
+            let get_docs_future = || async move {
+                let docs = store_reader.get_many_async(doc_ids).await?;
+                Ok::<_, crate::TantivyError>(docs.into_iter().map(move |(doc_id, doc)| {
+                    (
+                        DocAddress {
+                            segment_ord,
+                            doc_id,
+                        },
+                        doc,
+                    )
+                }))
+            };
+
+            futures.push(get_docs_future());
+        };
+
+        for doc_addr in doc_addresses {
+            if let Some(cur_set_doc_addr) = segment_ord_set.first() {
+                if doc_addr.segment_ord != cur_set_doc_addr.segment_ord {
+                    // The new `doc_addr`'s segment ordinal doesn't match that of the current
+                    // set, so grab the set's docs and then start a new set.
+                    get_docs_for_segment_ord_set(&mut segment_ord_set);
+                }
+            }
+
+            segment_ord_set.push(doc_addr);
+        }
+
+        if !segment_ord_set.is_empty() {
+            get_docs_for_segment_ord_set(&mut segment_ord_set);
+        }
+
+        // Debug assert to ensure that all `DocAddress`es were processed.
+        debug_assert!(segment_ord_set.is_empty());
+
+        let mut results = HashMap::<DocAddress, D>::with_capacity(doc_addresses.len());
+        while let Some(docs_result) = futures.next().await {
+            results.extend(docs_result?);
+        }
+
+        Ok(results)
+    }
+
     /// Access the schema associated with the index of this searcher.
     pub fn schema(&self) -> &Schema {
         &self.inner.schema
```

Reviewer: I understand that this is not a particularly technical objection, but I do wonder whether it is helpful to add this as additional API surface to Tantivy itself. User code can already sort its document addresses. For example, a service serving multiple users would probably aim for increased locality by handling a single user request as a single task accessing documents in address order to improve cache efficiency. Such a service would handle parallelism and concurrency by handling multiple users in multiple tasks. Furthermore, I have a hard time seeing a situation where additional within-segment concurrency is better than serial cache-friendly access patterns. Also, the additional data structures required to track this additional concurrency seem ripe with trade-offs which user code is just in a better position to make. Long story short: personally, I am not convinced this API is pulling its weight in terms of complexity and implied trade-offs.

Another commenter: Not reviewing the PR, but just commenting on why I think there is a need for something in that area. In the sync world, the best access pattern is pretty evident: make the cache store one docstore block, and sort your doc ids. If multiple docs need the same block, every doc but the first of each block comes free (no IO, no decompression, just some quick deserialization). No duplicated work, maximum cache hits. Now enter the async world. If you sort your docs and fetch multiple docs concurrently, you might end up fetching multiple docs from the same block, concurrently. They will each see a cache miss, issue an IO, decompress the result, put that in the cache, and get the doc of their interest. In quickwit-oss/quickwit#3226 it was found that this very much happens. In this case, user code has pretty much no idea what the best order is: random might cause more cache misses, and sorted does cause more cache misses. Issuing requests in some predetermined order that maximizes the chance that the cache is already warm, while not making accesses so far apart that blocks get evicted, was judged an "obscure trick" (which I don't disagree with). I don't know what the best solution is. Maybe it's a function to fetch multiple docs at once (originally my suggestion), but that makes tantivy choose a trade-off which might not be the best for your application. Maybe it's a function that gives insight into what order to use (something which, given a list of doc ids, tells you which can be fetched concurrently, and which should be fetched once another doc has already warmed the cache), letting you choose your trade-off with the full picture. But one way or another, the status quo is that short of obscure tricks, you are leaving some performance on the table for no good reason.

Reviewer: I would say that the main problem is that user code lacks the information required to choose the best access pattern, for example a function which informs user code about the blocks for a given document (opaquely, e.g. via a block identifier).

Author: While I agree that it may be beneficial to have the fine-grained control to choose the trade-off, from an API design perspective I believe the default behavior should be that users don't have to know about these implementation details; the fine-grained control should be an opt-in API. Otherwise, it'd be nice if the non-fine-tuned path had sane, reasonably performant defaults.

This is very good to know. At least in this change, I try to group all documents within the same block into one future to minimize this. Of course, multiple separate calls will not benefit from this grouping, but I'd want to see some cases where that is a common enough pattern to warrant further optimization.

Reviewer: I think the point here is exactly that those futures run concurrently against the same cache and hence will evict each other's blocks. After all, fetching multiple blocks concurrently is sort of the point of concurrency here. Hence, I think we should start with cheap cloning. (Concurrency, in contrast to parallelism, likely does little in a normal Tantivy local-disk-based setup, as opposed to a directory implementation calling into e.g. S3. Parallelism, on the other hand, would also benefit CPU-bound decompression, but usually needs integration with the runtime to spawn tasks.)

Reviewer: I proposed some preliminary API additions in #2276. Using these, I think fetching multiple documents should consist of grouping the addresses per segment and per block and then fetching each group. Obviously there are quite a few trade-offs implied here, the big one being concurrency versus parallelism. Some can possibly be side-stepped, e.g. always using a size-1 block cache for each group and spawning as many futures/tasks as there are groups, which would always avoid conflicts at the cost of more in-memory data structures. Similarly, we could forgo block reuse at the same cost of setting up in-memory data structures.

Reviewer: Maybe we can side-step the concurrency/parallelism trade-off here by providing an API that just yields the futures and requires the caller to poll them to completion, i.e. a signature like

`async fn docs_async(&self, doc_addresses: impl IntoIterator<Item = DocAddress>) -> impl Iterator<Item = impl Future<Output = crate::Result<Vec<D>>>>;`

which internally sets up the per-segment-per-block groups (e.g. sorting them into a `HashMap`).

Reviewer: Sorry for somewhat hijacking this change, but I pushed an implementation of the above to #2276 which, as discussed, is still a specific trade-off, but should avoid cache thrashing while still reusing the existing block cache.
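To make the "yield the futures and let the caller poll them" idea above concrete, here is a hedged sketch of how such an API might be consumed. `docs_futures` is a hypothetical method standing in for the signature sketched in the thread (nothing by that name exists in tantivy), and the concurrency limit of 4 is arbitrary:

```rust
use futures_util::{stream, StreamExt, TryStreamExt};

// Hypothetical: `docs_futures` yields one unpolled future per (segment, block)
// group, so the caller, not tantivy, picks the concurrency trade-off.
let per_block_futures = searcher.docs_futures::<TantivyDocument>(doc_addresses);

// `buffered(1)` would be strictly serial and maximally cache-friendly; a larger
// limit overlaps I/O and decompression at the risk of more cache misses.
let docs: Vec<TantivyDocument> = stream::iter(per_block_futures)
    .buffered(4)
    .try_concat() // concatenate each block's Vec of documents into one Vec
    .await?;
```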
In the store reader, the diff adds the `get_many` and `get_many_async` building blocks:
```diff
@@ -1,3 +1,4 @@
+use std::collections::{BTreeSet, HashMap};
 use std::io;
 use std::iter::Sum;
 use std::num::NonZeroUsize;

@@ -6,6 +7,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 
 use common::{BinarySerializable, OwnedBytes};
+#[cfg(feature = "quickwit")]
+use futures_util::StreamExt;
 use lru::LruCache;
 
 use super::footer::DocStoreFooter;
```
```diff
@@ -206,6 +209,44 @@ impl StoreReader {
         D::deserialize(deserializer).map_err(crate::TantivyError::from)
     }
 
+    /// Reads a set of given documents.
+    ///
+    /// Calling [`get_many`](Self::get_many) is more efficient than calling [`get`](Self::get)
+    /// multiple times, as it will only read and decompress a block once for all documents
+    /// within a block.
+    pub fn get_many<D: DocumentDeserialize>(
+        &self,
+        mut doc_ids: BTreeSet<DocId>,
+    ) -> crate::Result<HashMap<DocId, D>> {
+        let mut results = HashMap::with_capacity(doc_ids.len());
+
+        // Helper function to deserialize a document from bytes.
+        let deserialize_from_bytes = |doc_bytes: &mut OwnedBytes| {
+            let deserializer = BinaryDocumentDeserializer::from_reader(doc_bytes)
+                .map_err(crate::TantivyError::from)?;
+            D::deserialize(deserializer).map_err(crate::TantivyError::from)
+        };
+
+        while let Some(doc_id) = doc_ids.pop_last() {
+            let checkpoint = self.block_checkpoint(doc_id)?;
+            let block = self.read_block(&checkpoint)?;
+            let mut doc_bytes =
+                Self::get_document_bytes_from_block(block.clone(), doc_id, &checkpoint)?;
+
+            results.insert(doc_id, deserialize_from_bytes(&mut doc_bytes)?);
+
+            // Split off all doc ids that are in the same block and read them in as well.
+            let additional_doc_ids = doc_ids.split_off(&checkpoint.doc_range.start);
+            for doc_id in additional_doc_ids {
+                let mut doc_bytes =
+                    Self::get_document_bytes_from_block(block.clone(), doc_id, &checkpoint)?;
+                results.insert(doc_id, deserialize_from_bytes(&mut doc_bytes)?);
+            }
+        }
+
+        Ok(results)
+    }
+
     /// Returns raw bytes of a given document.
     ///
     /// Calling `.get(doc)` is relatively costly as it requires
```

Reviewer, on `get_many`: This does not seem to be used any more?
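The `pop_last`/`split_off` pairing is the heart of `get_many`. As a standalone illustration, here is a sketch of the same grouping logic under the simplifying (and unrealistic) assumption of fixed-size blocks; in the real store, block boundaries come from checkpoints and vary in size:

```rust
use std::collections::BTreeSet;

// Sketch: `pop_last` takes the largest remaining id, and `split_off(&block_start)`
// peels off every other id in the same block, so each block is visited exactly once.
fn group_by_block(mut doc_ids: BTreeSet<u32>, block_len: u32) -> Vec<Vec<u32>> {
    let mut groups = Vec::new();
    while let Some(doc_id) = doc_ids.pop_last() {
        let block_start = doc_id - doc_id % block_len; // assumed fixed-size blocks
        // `split_off` returns everything >= block_start, i.e. the rest of this block.
        let mut group: Vec<u32> = doc_ids.split_off(&block_start).into_iter().collect();
        group.push(doc_id);
        groups.push(group);
    }
    groups
}

// group_by_block({1, 2, 11, 12, 13}, 10) yields [[11, 12, 13], [1, 2]]:
// two block reads instead of five.
```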
```diff
@@ -377,6 +418,50 @@
             .map_err(crate::TantivyError::from)?;
         D::deserialize(deserializer).map_err(crate::TantivyError::from)
     }
+
+    /// Fetches a set of documents asynchronously. Async version of [`get_many`](Self::get_many),
+    /// except that it may read blocks in parallel.
+    pub async fn get_many_async<D: DocumentDeserialize>(
+        &self,
+        mut doc_ids: BTreeSet<DocId>,
+    ) -> crate::Result<HashMap<DocId, D>> {
+        // Capture the count up front: the loop below drains `doc_ids`, so calling
+        // `doc_ids.len()` afterwards would always yield zero.
+        let num_docs = doc_ids.len();
+
+        // Helper function to deserialize a document from bytes.
+        let deserialize_from_bytes = |doc_bytes: &mut OwnedBytes| {
+            let deserializer = BinaryDocumentDeserializer::from_reader(doc_bytes)
+                .map_err(crate::TantivyError::from)?;
+            D::deserialize(deserializer).map_err(crate::TantivyError::from)
+        };
+
+        let mut read_block_futures = futures_util::stream::FuturesUnordered::new();
+
+        // Spawn a future for each block to read.
+        while let Some(doc_id) = doc_ids.pop_last() {
+            let checkpoint = self.block_checkpoint(doc_id)?;
+
+            // Group all remaining doc ids that live in the same block with this one.
+            let mut checkpoint_doc_ids = doc_ids.split_off(&checkpoint.doc_range.start);
+            checkpoint_doc_ids.insert(doc_id);
+
+            let read_block_future = || async move {
+                let block = self.read_block_async(&checkpoint).await?;
+                Ok::<_, io::Error>((block, checkpoint, checkpoint_doc_ids))
+            };
+            read_block_futures.push(read_block_future());
+        }
+
+        let mut results = HashMap::with_capacity(num_docs);
+
+        while let Some(read_block_result) = read_block_futures.next().await {
+            let (block, checkpoint, checkpoint_doc_ids) = read_block_result?;
+
+            for doc_id in checkpoint_doc_ids {
+                let mut doc_bytes =
+                    Self::get_document_bytes_from_block(block.clone(), doc_id, &checkpoint)?;
+                results.insert(doc_id, deserialize_from_bytes(&mut doc_bytes)?);
+            }
+        }
+
+        Ok(results)
+    }
 }
 
 #[cfg(test)]
```
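For orientation, a hedged sketch of reaching a `StoreReader` through the public API and exercising the new `get_many` on one segment; it assumes an in-scope `searcher`, `TantivyDocument` as the document type, and that `get_store_reader` takes the block-cache size as it does in tantivy around the time of this PR:

```rust
use std::collections::{BTreeSet, HashMap};

use tantivy::{DocId, TantivyDocument};

// Read several documents from the first segment's store with a one-block cache.
// The BTreeSet keeps the ids sorted, so each block is decompressed at most once.
let store_reader = searcher.segment_readers()[0].get_store_reader(1)?;
let doc_ids: BTreeSet<DocId> = [0u32, 1, 2, 57].into_iter().collect();
let docs: HashMap<DocId, TantivyDocument> = store_reader.get_many(doc_ids)?;
```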
Reviewer: This function could be simplified if we rely on the `BlockCache`: since the `doc_addresses` are already sorted, we can just call `doc` and always hit the cache. The downside would be that this only works if the `BlockCache` parameter `cache_num_blocks` is not zero.

Author: I don't love relying on this invariant, since the dependency is non-obvious, but I've left a comment. It certainly does simplify the code here, though. In the future, we could also use a thread pool to parallelize access to blocks, which may require us to revive the previous code, but until then this is fine.

Reviewer: As written, I don't think this pulls its weight as an API addition, especially since it enforces a data structure (`BTreeSet`) that stays sorted under modification instead of, for example, just sorting a slice of `DocAddress`es. Maybe a performance hint on `Searcher::doc` to access multiple documents in address order if possible, to improve cache hit rates, would serve the same purpose?
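The alternative suggested in that last comment, sketched under the same assumptions as the earlier examples (in-scope `searcher`, hits from a prior search, non-zero `cache_num_blocks`):

```rust
// No new API surface: sort a plain Vec of addresses and fetch serially. DocAddress
// ordering puts same-segment, same-block documents next to each other, so the
// BlockCache turns all but the first access per block into cheap hits.
let mut addresses: Vec<DocAddress> = hits.into_iter().map(|(_score, addr)| addr).collect();
addresses.sort_unstable();

let mut docs = Vec::with_capacity(addresses.len());
for addr in addresses {
    docs.push(searcher.doc::<TantivyDocument>(addr)?);
}
```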