From bb1d227447647ee083afc8ee466222c858f98043 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 31 Aug 2022 11:59:36 +0900 Subject: [PATCH] Adding dragon API to build index without any thread. Closes #1487 --- common/src/writer.rs | 2 +- src/core/index.rs | 50 ++++++++++++++++++++++++- src/core/mod.rs | 2 + src/core/single_segment_index_writer.rs | 47 +++++++++++++++++++++++ src/lib.rs | 2 +- src/postings/postings_writer.rs | 2 +- src/postings/recorder.rs | 2 +- 7 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 src/core/single_segment_index_writer.rs diff --git a/common/src/writer.rs b/common/src/writer.rs index 9b8b86908d..c0f4f297ed 100644 --- a/common/src/writer.rs +++ b/common/src/writer.rs @@ -62,7 +62,7 @@ impl TerminatingWrite for CountingWriter { pub struct AntiCallToken(()); /// Trait used to indicate when no more write need to be done on a writer -pub trait TerminatingWrite: Write + Send { +pub trait TerminatingWrite: Write + Send + Sync { /// Indicate that the writer will no longer be used. Internally call terminate_ref. fn terminate(mut self) -> io::Result<()> where Self: Sized { diff --git a/src/core/index.rs b/src/core/index.rs index f3af70ef5a..7c69089484 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use super::segment::Segment; use super::IndexSettings; +use crate::core::single_segment_index_writer::SingleSegmentIndexWriter; use crate::core::{ Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH, }; @@ -163,6 +164,25 @@ impl IndexBuilder { self.create(mmap_directory) } + /// Dragons ahead!!! + /// + /// The point of this API is to let users create a simple index with a single segment + /// and without starting any thread. + /// + /// Do not use this method if you are not sure what you are doing. + /// + /// It expects an originally empty directory, and will not run any GC operation. + #[doc(hidden)] + pub fn single_segment_index_writer( + self, + dir: impl Into>, + mem_budget: usize, + ) -> crate::Result { + let index = self.create(dir)?; + let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?; + Ok(index_simple_writer) + } + /// Creates a new index in a temp directory. /// /// The index will use the `MMapDirectory` in a newly created directory. @@ -608,10 +628,12 @@ impl fmt::Debug for Index { #[cfg(test)] mod tests { + use crate::collector::Count; use crate::directory::{RamDirectory, WatchCallback}; - use crate::schema::{Field, Schema, INDEXED, TEXT}; + use crate::query::TermQuery; + use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT}; use crate::tokenizer::TokenizerManager; - use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy}; + use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term}; #[test] fn test_indexer_for_field() { @@ -877,4 +899,28 @@ mod tests { ); Ok(()) } + + #[test] + fn test_single_segment_index_writer() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let directory = RamDirectory::default(); + let mut single_segment_index_writer = Index::builder() + .schema(schema) + .single_segment_index_writer(directory, 10_000_000)?; + for _ in 0..10 { + let doc = doc!(text_field=>"hello"); + single_segment_index_writer.add_document(doc)?; + } + let index = single_segment_index_writer.finalize()?; + let searcher = index.reader()?.searcher(); + let term_query = TermQuery::new( + Term::from_field_text(text_field, "hello"), + IndexRecordOption::Basic, + ); + let count = searcher.search(&term_query, &Count)?; + assert_eq!(count, 10); + Ok(()) + } } diff --git a/src/core/mod.rs b/src/core/mod.rs index 6ebb652473..983e5c3b47 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,6 +7,7 @@ mod segment; mod segment_component; mod segment_id; mod segment_reader; +mod single_segment_index_writer; use std::path::Path; @@ -23,6 +24,7 @@ pub use self::segment::Segment; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; pub use self::segment_reader::SegmentReader; +pub use self::single_segment_index_writer::SingleSegmentIndexWriter; /// The meta file contains all the information about the list of segments and the schema /// of the index. diff --git a/src/core/single_segment_index_writer.rs b/src/core/single_segment_index_writer.rs new file mode 100644 index 0000000000..e1a3254aec --- /dev/null +++ b/src/core/single_segment_index_writer.rs @@ -0,0 +1,47 @@ +use crate::indexer::operation::AddOperation; +use crate::indexer::segment_updater::save_metas; +use crate::indexer::SegmentWriter; +use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment}; + +#[doc(hidden)] +pub struct SingleSegmentIndexWriter { + segment_writer: SegmentWriter, + segment: Segment, + opstamp: Opstamp, +} + +impl SingleSegmentIndexWriter { + pub fn new(index: Index, mem_budget: usize) -> crate::Result { + let segment = index.new_segment(); + let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?; + Ok(Self { + segment_writer, + segment, + opstamp: 0, + }) + } + + pub fn add_document(&mut self, document: Document) -> crate::Result<()> { + let opstamp = self.opstamp; + self.opstamp += 1; + self.segment_writer + .add_document(AddOperation { opstamp, document }) + } + + pub fn finalize(self) -> crate::Result { + let max_doc = self.segment_writer.max_doc(); + self.segment_writer.finalize()?; + let segment: Segment = self.segment.with_max_doc(max_doc); + let index = segment.index(); + let index_meta = IndexMeta { + index_settings: index.settings().clone(), + segments: vec![segment.meta().clone()], + schema: index.schema(), + opstamp: 0, + payload: None, + }; + save_metas(&index_meta, index.directory())?; + index.directory().sync_directory()?; + Ok(segment.index().clone()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 40ff110280..a1e48051cb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED}; pub use crate::core::{ Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta, - SegmentReader, + SegmentReader, SingleSegmentIndexWriter, }; pub use crate::directory::Directory; pub use crate::indexer::demuxer::*; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index eec2af2631..84c95739e6 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition { /// and building a `Segment` in anonymous memory. /// /// `PostingsWriter` writes in a `MemoryArena`. -pub(crate) trait PostingsWriter { +pub(crate) trait PostingsWriter: Send + Sync { /// Record that a document contains a term at a given position. /// /// * doc - the document id diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 578eeb9e84..d3bda8e727 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> { /// * the document id /// * the term frequency /// * the term positions -pub(crate) trait Recorder: Copy + Default + 'static { +pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static { /// Returns the current document fn current_doc(&self) -> u32; /// Starts recording information about a new document