From 5f08b1a20313cc021430ac8da9456b2c67f0e197 Mon Sep 17 00:00:00 2001
From: phiresky
Date: Tue, 18 May 2021 15:31:02 +0200
Subject: [PATCH 1/9] dynamic read implementation

---
 Cargo.toml                                   |  2 +-
 src/directory/file_slice.rs                  | 10 ++++--
 src/directory/mod.rs                         |  2 ++
 src/directory/on_demand_bytes.rs             | 34 ++++++++++++++++++++
 src/lib.rs                                   |  1 +
 src/termdict/fst_termdict/streamer.rs        |  4 +--
 src/termdict/fst_termdict/term_info_store.rs |  1 +
 src/termdict/fst_termdict/termdict.rs        | 23 +++++++++----
 8 files changed, 65 insertions(+), 12 deletions(-)
 create mode 100644 src/directory/on_demand_bytes.rs

diff --git a/Cargo.toml b/Cargo.toml
index e7130b47e8..0f1460a31f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ byteorder = "1"
 crc32fast = "1"
 once_cell = "1"
 regex ={version = "1", default-features = false, features = ["std"]}
-tantivy-fst = "0.3"
+tantivy-fst = {version="0.3", path="../tantivy-fst"}
 memmap = {version = "0.7", optional=true}
 lz4 = {version="1", optional=true}
 brotli = {version="3.3.0", optional=true}
diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs
index cc2b97aa6a..285432f469 100644
--- a/src/directory/file_slice.rs
+++ b/src/directory/file_slice.rs
@@ -4,6 +4,8 @@ use crate::common::HasLen;
 use crate::directory::OwnedBytes;
 use std::sync::{Arc, Weak};
 use std::{io, ops::Deref};
+use std::fmt::Debug;
+use super::OnDemandBytes;

 pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
 pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
@@ -16,7 +18,7 @@ pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
 /// The underlying behavior is therefore specific to the `Directory` that created it.
 /// Despite its name, a `FileSlice` may or may not directly map to an actual file
 /// on the filesystem.
-pub trait FileHandle: 'static + Send + Sync + HasLen {
+pub trait FileHandle: 'static + Send + Sync + HasLen + Debug {
     /// Reads a slice of bytes.
     ///
     /// This method may panic if the range requested is invalid.
     fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes>;
@@ -49,7 +51,7 @@ where
 //
 /// It can be cloned and sliced cheaply.
 ///
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct FileSlice {
     data: Arc<dyn FileHandle>,
     start: usize,
@@ -104,6 +106,10 @@ impl FileSlice {
         self.data.read_bytes(self.start, self.stop)
     }

+    pub fn read_ondemand(&self) -> io::Result<OnDemandBytes> {
+        Ok(OnDemandBytes::new(self.data.clone()))
+    }
+
     /// Reads a specific slice of data.
     ///
     /// This is equivalent to running `file_slice.slice(from, to).read_bytes()`.
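
The `FileHandle` trait above is the single extension point this series hangs off: anything that can serve ranged reads can back a `FileSlice`, which patch 3 later exploits with a chunk-caching `FsDirectory`. For reference, a minimal sketch of an implementation over an in-memory buffer; `InMemoryFile` is a hypothetical illustration, not part of the patch, and assumes `HasLen`, `OwnedBytes`, and `FileHandle` exactly as used above:

    use std::io;

    // Hypothetical illustration only, not part of the patch.
    #[derive(Debug)]
    struct InMemoryFile(Vec<u8>);

    impl HasLen for InMemoryFile {
        fn len(&self) -> usize {
            self.0.len()
        }
    }

    impl FileHandle for InMemoryFile {
        fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
            // Per the trait docs, an invalid range may panic; the slice
            // expression below does exactly that.
            Ok(OwnedBytes::new(self.0[from..to].to_vec()))
        }
    }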
diff --git a/src/directory/mod.rs b/src/directory/mod.rs
index 8bd2c31852..f1dda7f786 100644
--- a/src/directory/mod.rs
+++ b/src/directory/mod.rs
@@ -14,6 +14,7 @@ mod file_watcher;
 mod footer;
 mod managed_directory;
 mod owned_bytes;
+mod on_demand_bytes;
 mod ram_directory;
 mod watch_event_router;

 /// Errors specific to the directory module.
@@ -26,6 +27,7 @@ pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
 pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes};
 pub use self::file_slice::{FileHandle, FileSlice};
 pub use self::owned_bytes::OwnedBytes;
+pub use self::on_demand_bytes::{OnDemandBytes, OnDemandBox};
 pub use self::ram_directory::RAMDirectory;
 pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
 use std::io::{self, BufWriter, Write};
diff --git a/src/directory/on_demand_bytes.rs b/src/directory/on_demand_bytes.rs
new file mode 100644
index 0000000000..389707c2f0
--- /dev/null
+++ b/src/directory/on_demand_bytes.rs
@@ -0,0 +1,34 @@
+use std::{ops::Deref, sync::Arc};
+
+use tantivy_fst::{FakeArr, FakeArrPart, ShRange};
+
+use super::FileHandle;
+
+pub type OnDemandBox = Box<dyn FileHandle>;
+#[derive(Debug)]
+pub struct OnDemandBytes {
+    file: Arc<dyn FileHandle>
+}
+
+impl OnDemandBytes {
+    pub fn new(fh: Arc<dyn FileHandle>) -> OnDemandBytes {
+        OnDemandBytes {
+            file: fh
+        }
+    }
+}
+impl FakeArr for OnDemandBytes {
+    fn len(&self) -> usize {
+        self.file.len()
+    }
+
+    fn read_into(&self, offset: usize, buf: &mut [u8]) -> std::io::Result<()> {
+        let bytes = self.file.read_bytes(offset, offset + buf.len())?;
+        buf.copy_from_slice(&bytes[..]);
+        Ok(())
+    }
+
+    fn as_dyn(&self) -> &dyn FakeArr {
+        self
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 39e21c27ff..57709ca08c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -737,6 +737,7 @@ mod tests {
         let searcher = index_reader.searcher();
         let reader = searcher.segment_reader(0);
         let inverted_index = reader.inverted_index(text_field)?;
+        println!("terms: {:?}", inverted_index.terms());
         let term_abcd = Term::from_field_text(text_field, "abcd");
         assert!(inverted_index
             .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)?
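
`OnDemandBytes` is the glue between tantivy's `FileHandle` and the `FakeArr` trait that the patched `tantivy-fst` reads through: each `read_into` becomes one ranged `read_bytes` on the underlying handle, so only the bytes the FST traversal actually touches get fetched. A consumer-side sketch (hypothetical helper, using only the trait methods shown above):

    // Reads a little-endian u32 at `offset` through the FakeArr interface.
    fn read_u32_le(arr: &dyn FakeArr, offset: usize) -> std::io::Result<u32> {
        let mut buf = [0u8; 4];
        arr.read_into(offset, &mut buf)?; // one FileHandle::read_bytes(offset, offset + 4)
        Ok(u32::from_le_bytes(buf))
    }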
diff --git a/src/termdict/fst_termdict/streamer.rs b/src/termdict/fst_termdict/streamer.rs
index 66ce02c2ab..f187f9c49b 100644
--- a/src/termdict/fst_termdict/streamer.rs
+++ b/src/termdict/fst_termdict/streamer.rs
@@ -3,7 +3,7 @@ use std::io;
 use super::TermDictionary;
 use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
-use tantivy_fst::automaton::AlwaysMatch;
+use tantivy_fst::{FakeArr, automaton::AlwaysMatch};
 use tantivy_fst::map::{Stream, StreamBuilder};
 use tantivy_fst::Automaton;
 use tantivy_fst::{IntoStreamer, Streamer};
@@ -95,7 +95,7 @@ where
     pub fn advance(&mut self) -> bool {
         if let Some((term, term_ord)) = self.stream.next() {
             self.current_key.clear();
-            self.current_key.extend_from_slice(term);
+            self.current_key.extend_from_slice(&term.to_vec());
             self.term_ord = term_ord;
             self.current_value = self.fst_map.term_info_from_ord(term_ord);
             true
diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs
index 20b709a2f9..fa33eef4f2 100644
--- a/src/termdict/fst_termdict/term_info_store.rs
+++ b/src/termdict/fst_termdict/term_info_store.rs
@@ -85,6 +85,7 @@ impl TermInfoBlockMeta {
     }
 }

+#[derive(Debug)]
 pub struct TermInfoStore {
     num_terms: usize,
     block_meta_bytes: OwnedBytes,
diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs
index ff0d4ec5f5..98263cc5f5 100644
--- a/src/termdict/fst_termdict/termdict.rs
+++ b/src/termdict/fst_termdict/termdict.rs
@@ -1,13 +1,14 @@
 use super::term_info_store::{TermInfoStore, TermInfoStoreWriter};
 use super::{TermStreamer, TermStreamerBuilder};
-use crate::common::{BinarySerializable, CountingWriter};
+use crate::{common::{BinarySerializable, CountingWriter}, directory::OnDemandBox};
 use crate::directory::{FileSlice, OwnedBytes};
 use crate::error::DataCorruption;
 use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
+use crate::directory::OnDemandBytes;
 use once_cell::sync::Lazy;
 use std::io::{self, Write};
-use tantivy_fst::raw::Fst;
+use tantivy_fst::{FakeArr, raw::Fst};
 use tantivy_fst::Automaton;

 fn convert_fst_error(e: tantivy_fst::Error) -> io::Error {
@@ -85,13 +86,17 @@ where
     }
 }

-fn open_fst_index(fst_file: FileSlice) -> crate::Result<tantivy_fst::Map<OwnedBytes>> {
-    let bytes = fst_file.read_bytes()?;
+fn open_fst_index(fst_file: FileSlice) -> crate::Result<tantivy_fst::Map<OnDemandBytes>> {
+    println!("open_fst_index()");
+    let bytes = fst_file.read_ondemand()?;
     let fst = Fst::new(bytes)
         .map_err(|err| DataCorruption::comment_only(format!("Fst data is corrupted: {:?}", err)))?;
-    Ok(tantivy_fst::Map::from(fst))
+    let ret = Ok(tantivy_fst::Map::from(fst));
+    println!("open_fst_index RET");
+    return ret;
 }

+
 static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
     let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
         .expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
@@ -106,8 +111,9 @@ static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
 /// The `Fst` crate is used to associate terms to their
 /// respective `TermOrdinal`. The `TermInfoStore` then makes it
 /// possible to fetch the associated `TermInfo`.
+#[derive(Debug)]
 pub struct TermDictionary {
-    fst_index: tantivy_fst::Map<OwnedBytes>,
+    fst_index: tantivy_fst::Map<OnDemandBytes>,
     term_info_store: TermInfoStore,
 }

@@ -139,7 +145,10 @@ impl TermDictionary {
     /// Returns the ordinal associated to a given term.
     pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
-        Ok(self.fst_index.get(key))
+        println!("termdict.term_ord({:?})", String::from_utf8_lossy(key.as_ref()));
+        let ret = Ok(self.fst_index.get(key));
+        println!("termdict.term_ord RET");
+        return ret;
     }

     /// Returns the term associated to a given term ordinal.

From 866a11c3e19c4999c910cf334c00f52fb3b0ab9f Mon Sep 17 00:00:00 2001
From: phiresky
Date: Tue, 18 May 2021 17:09:37 +0200
Subject: [PATCH 2/9] works!

---
 src/directory/file_slice.rs           | 26 +++++++++++++++++++++-----
 src/directory/on_demand_bytes.rs      |  2 ++
 src/directory/owned_bytes.rs          | 23 +++++++++++++++++++++--
 src/directory/ram_directory.rs        | 10 ++--------
 src/directory/watch_event_router.rs   |  9 +++++++--
 src/lib.rs                            |  6 ++++--
 src/termdict/fst_termdict/termdict.rs |  8 ++++----
 7 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs
index 285432f469..313666f90a 100644
--- a/src/directory/file_slice.rs
+++ b/src/directory/file_slice.rs
@@ -1,11 +1,12 @@
 use stable_deref_trait::StableDeref;
+use tantivy_fst::FakeArr;
+
+use super::OnDemandBytes;
 use crate::common::HasLen;
 use crate::directory::OwnedBytes;
+use std::fmt::Debug;
 use std::sync::{Arc, Weak};
 use std::{io, ops::Deref};
-use std::fmt::Debug;
-use super::OnDemandBytes;

 pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
 pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
@@ -25,6 +26,21 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + Debug {
     fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes>;
 }

+impl FakeArr for FileSlice {
+    fn len(&self) -> usize {
+        self.stop - self.start
+    }
+
+    fn read_into(&self, offset: usize, buf: &mut [u8]) -> io::Result<()> {
+        buf.copy_from_slice(&self.read_bytes_slice(offset, offset + buf.len())?);
+        Ok(())
+    }
+
+    fn as_dyn(&self) -> &dyn FakeArr {
+        self
+    }
+}
+
 impl FileHandle for &'static [u8] {
     fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
         let bytes = &self[from..to];
         Ok(OwnedBytes::new(bytes))
     }
 }
@@ -81,7 +97,7 @@ impl FileSlice {
     ///
     /// Panics if `to < from` or if `to` exceeds the filesize.
     pub fn slice(&self, from: usize, to: usize) -> FileSlice {
-        assert!(to <= self.len());
+        assert!(to <= <Self as HasLen>::len(&self));
         assert!(to >= from);
         FileSlice {
             data: self.data.clone(),
@@ -135,7 +151,7 @@ impl FileSlice {
     /// Splits the file slice at the given offset and return two file slices.
     /// `file_slice[..split_offset]` and `file_slice[split_offset..]`.
pub fn split_from_end(self, right_len: usize) -> (FileSlice, FileSlice) { - let left_len = self.len() - right_len; + let left_len = HasLen::len(&self) - right_len; self.split(left_len) } @@ -144,7 +160,7 @@ impl FileSlice { /// /// Equivalent to `.slice(from_offset, self.len())` pub fn slice_from(&self, from_offset: usize) -> FileSlice { - self.slice(from_offset, self.len()) + self.slice(from_offset, ::len(&self)) } /// Like `.slice(...)` but enforcing only the `to` diff --git a/src/directory/on_demand_bytes.rs b/src/directory/on_demand_bytes.rs index 389707c2f0..42426ca71f 100644 --- a/src/directory/on_demand_bytes.rs +++ b/src/directory/on_demand_bytes.rs @@ -23,7 +23,9 @@ impl FakeArr for OnDemandBytes { } fn read_into(&self, offset: usize, buf: &mut [u8]) -> std::io::Result<()> { + assert!(offset + buf.len() <= self.len(), "{} <= {}", offset + buf.len(), self.len()); let bytes = self.file.read_bytes(offset, offset + buf.len())?; + assert_eq!(buf.len(), bytes.len()); buf.copy_from_slice(&bytes[..]); Ok(()) } diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs index 73303f50cc..93cefced10 100644 --- a/src/directory/owned_bytes.rs +++ b/src/directory/owned_bytes.rs @@ -1,5 +1,6 @@ use crate::directory::FileHandle; use stable_deref_trait::StableDeref; +//use tantivy_fst::FakeArr; use std::convert::TryInto; use std::mem; use std::ops::Deref; @@ -22,6 +23,24 @@ impl FileHandle for OwnedBytes { } } + +/*impl FakeArr for OwnedBytes { + fn len(&self) -> usize { + self.data.len() + } + + fn read_into(&self, offset: usize, buf: &mut [u8]) -> std::io::Result<()> { + let bytes = self.read_bytes(offset, offset + buf.len())?; + buf.copy_from_slice(&bytes[..]); + Ok(()) + } + + fn as_dyn(&self) -> &dyn FakeArr { + self + } +}*/ + + impl OwnedBytes { /// Creates an empty `OwnedBytes`. pub fn empty() -> OwnedBytes { @@ -120,8 +139,8 @@ impl fmt::Debug for OwnedBytes { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We truncate the bytes in order to make sure the debug string // is not too long. - let bytes_truncated: &[u8] = if self.len() > 8 { - &self.as_slice()[..10] + let bytes_truncated: &[u8] = if self.len() > 1000 { + &self.as_slice()[..1000] } else { self.as_slice() }; diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index f5ddcd955e..589247e451 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -80,7 +80,7 @@ impl TerminatingWrite for VecWriter { } } -#[derive(Default)] +#[derive(Default, Debug)] struct InnerDirectory { fs: HashMap, watch_router: WatchCallbackList, @@ -119,18 +119,12 @@ impl InnerDirectory { } } -impl fmt::Debug for RAMDirectory { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "RAMDirectory") - } -} - /// A Directory storing everything in anonymous memory. /// /// It is mainly meant for unit testing. /// Writes are only made visible upon flushing. /// -#[derive(Clone, Default)] +#[derive(Clone, Default, Debug)] pub struct RAMDirectory { fs: Arc>, } diff --git a/src/directory/watch_event_router.rs b/src/directory/watch_event_router.rs index c42d03be30..72160dee95 100644 --- a/src/directory/watch_event_router.rs +++ b/src/directory/watch_event_router.rs @@ -3,11 +3,16 @@ use futures::{Future, TryFutureExt}; use std::sync::Arc; use std::sync::RwLock; use std::sync::Weak; +use std::fmt::Debug; /// Cloneable wrapper for callbacks registered when watching files of a `Directory`. 
#[derive(Clone)] pub struct WatchCallback(Arc); - +impl Debug for WatchCallback { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Ok(()) + } +} impl WatchCallback { /// Wraps a `Fn()` to create a WatchCallback. pub fn new(op: F) -> Self { @@ -23,7 +28,7 @@ impl WatchCallback { /// /// It registers callbacks (See `.subscribe(...)`) and /// calls them upon calls to `.broadcast(...)`. -#[derive(Default)] +#[derive(Default, Debug)] pub struct WatchCallbackList { router: RwLock>>, } diff --git a/src/lib.rs b/src/lib.rs index 57709ca08c..4f985785fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -282,7 +282,7 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId); #[cfg(test)] mod tests { - use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; + use crate::{Directory, collector::tests::TEST_COLLECTOR_WITH_SCORE}; use crate::core::SegmentReader; use crate::docset::{DocSet, TERMINATED}; use crate::query::BooleanQuery; @@ -633,10 +633,12 @@ mod tests { let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; - let negative_val = -1i64; + let negative_val = 242i64; index_writer.add_document(doc!(value_field => negative_val)); index_writer.commit()?; + println!("dir: {:#?}", index.directory()); let reader = index.reader()?; + let searcher = reader.searcher(); let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs index 98263cc5f5..9be4543c16 100644 --- a/src/termdict/fst_termdict/termdict.rs +++ b/src/termdict/fst_termdict/termdict.rs @@ -86,10 +86,9 @@ where } } -fn open_fst_index(fst_file: FileSlice) -> crate::Result> { +fn open_fst_index(fst_file: FileSlice) -> crate::Result> { println!("open_fst_index()"); - let bytes = fst_file.read_ondemand()?; - let fst = Fst::new(bytes) + let fst = Fst::new(fst_file) .map_err(|err| DataCorruption::comment_only(format!("Fst data is corrupted: {:?}", err)))?; let ret = Ok(tantivy_fst::Map::from(fst)); println!("open_fst_index RET"); @@ -97,6 +96,7 @@ fn open_fst_index(fst_file: FileSlice) -> crate::Result = Lazy::new(|| { let term_dictionary_data: Vec = TermDictionaryBuilder::create(Vec::::new()) .expect("Creating a TermDictionaryBuilder in a Vec should never fail") @@ -113,7 +113,7 @@ static EMPTY_TERM_DICT_FILE: Lazy = Lazy::new(|| { /// possible to fetch the associated `TermInfo`. 
#[derive(Debug)] pub struct TermDictionary { - fst_index: tantivy_fst::Map, + fst_index: tantivy_fst::Map, term_info_store: TermInfoStore, } From 0671d2426fd9829f0c9e7636dff941f430fdfe75 Mon Sep 17 00:00:00 2001 From: phiresky Date: Wed, 19 May 2021 21:54:44 +0200 Subject: [PATCH 3/9] better --- Cargo.toml | 1 + src/core/inverted_index_reader.rs | 2 +- src/directory/file_slice.rs | 9 +- src/directory/fs_directory.rs | 165 +++++++++++++++++++ src/directory/mod.rs | 6 +- src/directory/on_demand_bytes.rs | 36 ---- src/fieldnorm/reader.rs | 17 +- src/postings/block_segment_postings.rs | 59 ++++--- src/postings/compression/mod.rs | 9 +- src/postings/compression/vint.rs | 10 +- src/postings/skip.rs | 14 +- src/termdict/fst_termdict/term_info_store.rs | 35 ++-- src/termdict/fst_termdict/termdict.rs | 5 +- 13 files changed, 255 insertions(+), 113 deletions(-) create mode 100644 src/directory/fs_directory.rs delete mode 100644 src/directory/on_demand_bytes.rs diff --git a/Cargo.toml b/Cargo.toml index 0f1460a31f..3654021c3f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ chrono = "0.4" smallvec = "1" rayon = "1" lru = "0.6" +backtrace = "0.3.59" [target.'cfg(windows)'.dependencies] winapi = "0.3" diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 2f4edf76d4..9c4a1d049c 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -93,7 +93,7 @@ impl InvertedIndexReader { let start_offset = term_info.postings_start_offset as usize; let stop_offset = term_info.postings_stop_offset as usize; let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset); - block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?); + block_postings.reset(term_info.doc_freq, postings_slice); Ok(()) } diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs index 313666f90a..b460cd0a04 100644 --- a/src/directory/file_slice.rs +++ b/src/directory/file_slice.rs @@ -1,7 +1,6 @@ use stable_deref_trait::StableDeref; -use tantivy_fst::FakeArr; +pub use tantivy_fst::FakeArr; -use super::OnDemandBytes; use crate::common::HasLen; use crate::directory::OwnedBytes; use std::fmt::Debug; @@ -122,10 +121,6 @@ impl FileSlice { self.data.read_bytes(self.start, self.stop) } - pub fn read_ondemand(&self) -> io::Result { - Ok(OnDemandBytes::new(self.data.clone())) - } - /// Reads a specific slice of data. /// /// This is equivalent to running `file_slice.slice(from, to).read_bytes()`. 
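
With `FakeArr` implemented for `FileSlice` itself (previous patch), this patch can hand slices directly to `FakeArr` consumers instead of materializing their bytes first; `Fst::new(fst_file)` and the `block_postings.reset(term_info.doc_freq, postings_slice)` change above are the two central examples. A hypothetical consumer, to make the access pattern concrete (illustration only, not part of the patch):

    // Sums all bytes of a FileSlice through FakeArr, 256 bytes at a time.
    // Each read_into is one ranged read on the backing FileHandle, so the
    // slice is never loaded wholesale.
    fn checksum(slice: &FileSlice) -> std::io::Result<u32> {
        let total = FakeArr::len(slice); // disambiguated from HasLen::len
        let (mut pos, mut sum) = (0usize, 0u32);
        let mut buf = [0u8; 256];
        while pos < total {
            let n = buf.len().min(total - pos);
            slice.read_into(pos, &mut buf[..n])?;
            sum = buf[..n].iter().fold(sum, |s, &b| s.wrapping_add(u32::from(b)));
            pos += n;
        }
        Ok(sum)
    }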
@@ -133,7 +128,7 @@ impl FileSlice { assert!(from <= to); assert!( self.start + to <= self.stop, - "`to` exceeds the fileslice length" + "`to` exceeds the fileslice length, {}, {}, {}", self.start, to, self.stop ); self.data.read_bytes(self.start + from, self.start + to) } diff --git a/src/directory/fs_directory.rs b/src/directory/fs_directory.rs new file mode 100644 index 0000000000..e409fdd5bb --- /dev/null +++ b/src/directory/fs_directory.rs @@ -0,0 +1,165 @@ +use std::{ + collections::BTreeMap, + fs::File, + io::{BufWriter, Read, Seek, SeekFrom, Write}, + ops::DerefMut, + path::{Path, PathBuf}, + sync::{Arc, RwLock}, +}; + +use crate::{ + directory::{error::OpenWriteError, FileHandle, OwnedBytes, TerminatingWrite, WatchHandle}, + Directory, HasLen, +}; + +use super::{ + error::{DeleteError, OpenReadError}, + AntiCallToken, WatchCallback, WritePtr, +}; + +#[derive(Debug, Clone)] +pub struct FsDirectory { + root: PathBuf, +} + +impl FsDirectory { + pub fn new(path: &Path) -> FsDirectory { + FsDirectory { + root: path.to_path_buf(), + } + } +} + +struct Noop {} +impl Write for Noop { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} +impl TerminatingWrite for Noop { + fn terminate_ref(&mut self, _: AntiCallToken) -> std::io::Result<()> { + Ok(()) + } +} +impl Directory for FsDirectory { + fn get_file_handle(&self, path: &Path) -> Result, OpenReadError> { + Ok(Box::new(FSFile::new(&self.root.join(path)))) + } + + fn delete(&self, path: &Path) -> Result<(), DeleteError> { + println!("delete {:?}", path); + Ok(()) + } + + fn exists(&self, path: &Path) -> Result { + todo!() + } + + fn open_write(&self, path: &Path) -> Result { + Ok(BufWriter::new(Box::new(Noop {}))) + } + + fn atomic_read(&self, path: &Path) -> Result, OpenReadError> { + let path = self.root.join(path); + println!("atomic_read {:?}", path); + Ok(std::fs::read(path).unwrap()) + } + + fn atomic_write(&self, path: &Path, data: &[u8]) -> std::io::Result<()> { + todo!() + } + + fn watch(&self, watch_callback: WatchCallback) -> crate::Result { + Ok(WatchHandle::empty()) + } +} + +#[derive(Debug)] +struct FSFile { + path: PathBuf, + file: Arc>, + len: usize, + cache: RwLock>>, +} +const CS: usize = 4096; + +impl FSFile { + pub fn new(path: &Path) -> FSFile { + let mut f = File::open(path).unwrap(); + let len = f.seek(SeekFrom::End(0)).unwrap(); + FSFile { + path: path.to_path_buf(), + file: Arc::new(RwLock::new(f)), + len: len as usize, + cache: RwLock::new(BTreeMap::new()), + } + } + fn read_bytes_real(&self, from: usize, to: usize) -> Vec { + let len = to - from; + + eprintln!( + "READ {} @ {}, len {}", + self.path.to_string_lossy(), + from, + len + ); + if len == 51616 { + println!("{:?}", backtrace::Backtrace::new()); + } + if len > 1_000_000 { + println!("{:?}", backtrace::Backtrace::new()); + } + if len > 2_000_000 { + panic!("tried to read too much"); + } + let mut f = self.file.write().unwrap(); + f.seek(SeekFrom::Start(from as u64)).unwrap(); + let mut buf = Vec::with_capacity(len); + let flonk = f.deref_mut(); + (flonk).take(len as u64).read_to_end(&mut buf).unwrap(); + return buf; + } +} +impl FileHandle for FSFile { + fn read_bytes(&self, from: usize, to: usize) -> std::io::Result { + let len = to - from; + eprintln!( + "GET {} @ {}, len {}", + self.path.to_string_lossy(), + from, + len + ); + let starti = from / CS; + let endi = to / CS; + let startofs = from % CS; + let endofs = to % CS; + let mut out_buf = vec![0u8; len]; + //let 
toget = vec![]; + let mut cache = self.cache.write().unwrap(); + let mut written = 0; + for i in starti..=endi { + let startofs = if i == starti { startofs } else { 0 }; + let endofs = if i == endi { endofs } else { CS }; + let chunk = cache.entry(i).or_insert_with(|| { + self.read_bytes_real(i * CS, std::cmp::min((i + 1) * CS, self.len())) + }); + let chunk = &chunk[startofs..endofs]; + println!("{} {} {} {}", out_buf.len(), startofs, endofs, chunk.len()); + let write_len = std::cmp::min(chunk.len(), len); + out_buf[written..written + write_len] + .copy_from_slice(&chunk); + written += write_len; + } + + Ok(OwnedBytes::new(out_buf)) + } +} +impl HasLen for FSFile { + fn len(&self) -> usize { + self.len + } +} diff --git a/src/directory/mod.rs b/src/directory/mod.rs index f1dda7f786..7361900dfb 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -14,8 +14,8 @@ mod file_watcher; mod footer; mod managed_directory; mod owned_bytes; -mod on_demand_bytes; mod ram_directory; +mod fs_directory; mod watch_event_router; /// Errors specific to the directory module. @@ -25,9 +25,8 @@ pub use self::directory::DirectoryLock; pub use self::directory::{Directory, DirectoryClone}; pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK}; pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes}; -pub use self::file_slice::{FileHandle, FileSlice}; +pub use self::file_slice::{FileHandle, FileSlice, FakeArr}; pub use self::owned_bytes::OwnedBytes; -pub use self::on_demand_bytes::{OnDemandBytes, OnDemandBox}; pub use self::ram_directory::RAMDirectory; pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle}; use std::io::{self, BufWriter, Write}; @@ -49,6 +48,7 @@ pub struct GarbageCollectionResult { #[cfg(feature = "mmap")] pub use self::mmap_directory::MmapDirectory; +pub use self::fs_directory::FsDirectory; pub use self::managed_directory::ManagedDirectory; diff --git a/src/directory/on_demand_bytes.rs b/src/directory/on_demand_bytes.rs deleted file mode 100644 index 42426ca71f..0000000000 --- a/src/directory/on_demand_bytes.rs +++ /dev/null @@ -1,36 +0,0 @@ -use std::{ops::Deref, sync::Arc}; - -use tantivy_fst::{FakeArr, FakeArrPart, ShRange}; - -use super::FileHandle; - -pub type OnDemandBox = Box; -#[derive(Debug)] -pub struct OnDemandBytes { - file: Arc -} - -impl OnDemandBytes { - pub fn new(fh: Arc) -> OnDemandBytes { - OnDemandBytes { - file: fh - } - } -} -impl FakeArr for OnDemandBytes { - fn len(&self) -> usize { - self.file.len() - } - - fn read_into(&self, offset: usize, buf: &mut [u8]) -> std::io::Result<()> { - assert!(offset + buf.len() <= self.len(), "{} <= {}", offset + buf.len(), self.len()); - let bytes = self.file.read_bytes(offset, offset + buf.len())?; - assert_eq!(buf.len(), bytes.len()); - buf.copy_from_slice(&bytes[..]); - Ok(()) - } - - fn as_dyn(&self) -> &dyn FakeArr { - self - } -} diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index e1ce150076..f1f8ef805b 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,5 +1,7 @@ +use tantivy_fst::FakeArr; + use super::{fieldnorm_to_id, id_to_fieldnorm}; -use crate::common::CompositeFile; +use crate::{HasLen, common::CompositeFile}; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::schema::Field; @@ -71,7 +73,7 @@ impl From for FieldNormReader { #[derive(Clone)] enum ReaderImplEnum { - FromData(OwnedBytes), + FromData(FileSlice), Const { num_docs: u32, fieldnorm_id: u8, @@ -97,18 +99,17 @@ impl FieldNormReader { /// Opens a field 
norm reader given its file. pub fn open(fieldnorm_file: FileSlice) -> crate::Result { - let data = fieldnorm_file.read_bytes()?; - Ok(FieldNormReader::new(data)) + Ok(FieldNormReader::new(fieldnorm_file)) } - fn new(data: OwnedBytes) -> Self { + fn new(data: FileSlice) -> Self { ReaderImplEnum::FromData(data).into() } /// Returns the number of documents in this segment. pub fn num_docs(&self) -> u32 { match &self.0 { - ReaderImplEnum::FromData(data) => data.len() as u32, + ReaderImplEnum::FromData(data) => HasLen::len(data) as u32, ReaderImplEnum::Const { num_docs, .. } => *num_docs, } } @@ -125,7 +126,7 @@ impl FieldNormReader { pub fn fieldnorm(&self, doc_id: DocId) -> u32 { match &self.0 { ReaderImplEnum::FromData(data) => { - let fieldnorm_id = data.as_slice()[doc_id as usize]; + let fieldnorm_id = data.get_byte(doc_id as usize); id_to_fieldnorm(fieldnorm_id) } ReaderImplEnum::Const { fieldnorm, .. } => *fieldnorm, @@ -137,7 +138,7 @@ impl FieldNormReader { pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 { match &self.0 { ReaderImplEnum::FromData(data) => { - let fieldnorm_id = data.as_slice()[doc_id as usize]; + let fieldnorm_id = data.get_byte(doc_id as usize); fieldnorm_id } ReaderImplEnum::Const { fieldnorm_id, .. } => *fieldnorm_id, diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 9030d8a571..41b56b41b7 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -11,6 +11,7 @@ use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; use crate::query::BM25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; +use tantivy_fst::FakeArr; fn max_score>(mut it: I) -> Option { if let Some(first) = it.next() { @@ -37,19 +38,30 @@ pub struct BlockSegmentPostings { doc_freq: u32, - data: OwnedBytes, + data: FileSlice, pub(crate) skip_reader: SkipReader, } fn decode_bitpacked_block( doc_decoder: &mut BlockDecoder, freq_decoder_opt: Option<&mut BlockDecoder>, - data: &[u8], + data: &dyn FakeArr, doc_offset: DocId, doc_num_bits: u8, tf_num_bits: u8, ) { - let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits); + let num_bytes_docs = 128 * (doc_num_bits as usize) / 8; // 128 integers per bitpacker4x block. 
should be same as num_consumed_bytes returned by uncompress block + let num_bytes_freqs = freq_decoder_opt.as_ref() + .map(|_| 128 * (tf_num_bits as usize) / 8) + .unwrap_or(0); + let num_toconsume_bytes = num_bytes_docs + num_bytes_freqs; + let data = data.slice((0..num_bytes_docs + num_bytes_freqs).into()).to_vec(); + let num_consumed_bytes = doc_decoder.uncompress_block_sorted(&data, doc_offset, doc_num_bits); + assert_eq!(num_bytes_docs, num_consumed_bytes); + println!( + "ofs={}, bits={}, consumed={}", + doc_offset, doc_num_bits, num_consumed_bytes + ); if let Some(freq_decoder) = freq_decoder_opt { freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits); } @@ -58,7 +70,7 @@ fn decode_bitpacked_block( fn decode_vint_block( doc_decoder: &mut BlockDecoder, freq_decoder_opt: Option<&mut BlockDecoder>, - data: &[u8], + data: &dyn FakeArr, doc_offset: DocId, num_vint_docs: usize, ) { @@ -66,22 +78,24 @@ fn decode_vint_block( doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED); if let Some(freq_decoder) = freq_decoder_opt { freq_decoder.uncompress_vint_unsorted( - &data[num_consumed_bytes..], + &data.slice((num_consumed_bytes..).into()), num_vint_docs, TERMINATED, ); } } -fn split_into_skips_and_postings( - doc_freq: u32, - mut bytes: OwnedBytes, -) -> (Option, OwnedBytes) { +fn split_into_skips_and_postings(doc_freq: u32, data: FileSlice) -> (Option, FileSlice) { if doc_freq < COMPRESSION_BLOCK_SIZE as u32 { - return (None, bytes); + return (None, data); } - let skip_len = VInt::deserialize(&mut bytes).expect("Data corrupted").0 as usize; - let (skip_data, postings_data) = bytes.split(skip_len); + // hacky code + let slice = &mut data.full_slice(); + let inx_before = slice.get_offset(); + let skip_len = VInt::deserialize(slice).expect("Data corrupted").0 as usize; + let inx_after = slice.get_offset(); + let data = data.slice_from(inx_after - inx_before); + let (skip_data, postings_data) = data.split(skip_len); (Some(skip_data), postings_data) } @@ -98,11 +112,10 @@ impl BlockSegmentPostings { (_, _) => FreqReadingOption::ReadFreq, }; - let (skip_data_opt, postings_data) = - split_into_skips_and_postings(doc_freq, data.read_bytes()?); + let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data); let skip_reader = match skip_data_opt { Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option), - None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option), + None => SkipReader::new(FileSlice::empty(), doc_freq, record_option), }; let mut block_segment_postings = BlockSegmentPostings { @@ -172,7 +185,7 @@ impl BlockSegmentPostings { // # Warning // // This does not reset the positions list. 
- pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) { + pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: FileSlice) { let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data); self.data = postings_data; self.block_max_score_cache = None; @@ -180,7 +193,7 @@ impl BlockSegmentPostings { if let Some(skip_data) = skip_data_opt { self.skip_reader.reset(skip_data, doc_freq); } else { - self.skip_reader.reset(OwnedBytes::empty(), doc_freq); + self.skip_reader.reset(FileSlice::empty(), doc_freq); } self.doc_freq = doc_freq; self.load_block(); @@ -297,7 +310,7 @@ impl BlockSegmentPostings { } else { None }, - &self.data.as_slice()[offset..], + &self.data.slice(offset, self.data.len()), self.skip_reader.last_doc_in_previous_block, doc_num_bits, tf_num_bits, @@ -306,9 +319,9 @@ impl BlockSegmentPostings { BlockInfo::VInt { num_docs } => { let data = { if num_docs == 0 { - &[] + FileSlice::empty() } else { - &self.data.as_slice()[offset..] + self.data.slice(offset, self.data.len()) } }; decode_vint_block( @@ -318,7 +331,7 @@ impl BlockSegmentPostings { } else { None }, - data, + &data, self.skip_reader.last_doc_in_previous_block, num_docs as usize, ); @@ -344,8 +357,8 @@ impl BlockSegmentPostings { freq_reading_option: FreqReadingOption::NoFreq, block_max_score_cache: None, doc_freq: 0, - data: OwnedBytes::empty(), - skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), + data: FileSlice::empty(), + skip_reader: SkipReader::new(FileSlice::empty(), 0, IndexRecordOption::Basic), } } } diff --git a/src/postings/compression/mod.rs b/src/postings/compression/mod.rs index 5fd5d6a900..29da6de074 100644 --- a/src/postings/compression/mod.rs +++ b/src/postings/compression/mod.rs @@ -1,5 +1,6 @@ use crate::common::FixedSize; use bitpacking::{BitPacker, BitPacker4x}; +use tantivy_fst::FakeArr; pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES; @@ -148,7 +149,7 @@ pub trait VIntDecoder { /// The value given in `padding` will be used to fill the remaining `128 - num_els` values. fn uncompress_vint_sorted( &mut self, - compressed_data: &[u8], + compressed_data: &dyn FakeArr, offset: u32, num_els: usize, padding: u32, @@ -163,7 +164,7 @@ pub trait VIntDecoder { /// The value given in `padding` will be used to fill the remaining `128 - num_els` values. 
fn uncompress_vint_unsorted( &mut self, - compressed_data: &[u8], + compressed_data: &dyn FakeArr, num_els: usize, padding: u32, ) -> usize; @@ -182,7 +183,7 @@ impl VIntEncoder for BlockEncoder { impl VIntDecoder for BlockDecoder { fn uncompress_vint_sorted( &mut self, - compressed_data: &[u8], + compressed_data: &dyn FakeArr, offset: u32, num_els: usize, padding: u32, @@ -194,7 +195,7 @@ impl VIntDecoder for BlockDecoder { fn uncompress_vint_unsorted( &mut self, - compressed_data: &[u8], + compressed_data: &dyn FakeArr, num_els: usize, padding: u32, ) -> usize { diff --git a/src/postings/compression/vint.rs b/src/postings/compression/vint.rs index 3de43749f8..f8e7eb1533 100644 --- a/src/postings/compression/vint.rs +++ b/src/postings/compression/vint.rs @@ -1,3 +1,5 @@ +use tantivy_fst::FakeArr; + #[inline(always)] pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { let mut byte_written = 0; @@ -42,13 +44,13 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a } #[inline(always)] -pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize { +pub fn uncompress_sorted(compressed_data: &dyn FakeArr, output: &mut [u32], offset: u32) -> usize { let mut read_byte = 0; let mut result = offset; for output_mut in output.iter_mut() { let mut shift = 0u32; loop { - let cur_byte = compressed_data[read_byte]; + let cur_byte = compressed_data.get_byte(read_byte); read_byte += 1; result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { @@ -62,13 +64,13 @@ pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32 } #[inline(always)] -pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize { +pub(crate) fn uncompress_unsorted(compressed_data: &dyn FakeArr, output_arr: &mut [u32]) -> usize { let mut read_byte = 0; for output_mut in output_arr.iter_mut() { let mut result = 0u32; let mut shift = 0u32; loop { - let cur_byte = compressed_data[read_byte]; + let cur_byte = compressed_data.get_byte(read_byte); read_byte += 1; result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 8d4310eb23..ba4c7d2bf1 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,6 +1,8 @@ use std::convert::TryInto; -use crate::directory::OwnedBytes; +use tantivy_fst::FakeArr; + +use crate::directory::{FileSlice, OwnedBytes}; use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE}; use crate::query::BM25Weight; use crate::schema::IndexRecordOption; @@ -71,7 +73,7 @@ impl SkipSerializer { pub(crate) struct SkipReader { last_doc_in_block: DocId, pub(crate) last_doc_in_previous_block: DocId, - owned_read: OwnedBytes, + owned_read: FileSlice, skip_info: IndexRecordOption, byte_offset: usize, remaining_docs: u32, // number of docs remaining, including the @@ -102,7 +104,7 @@ impl Default for BlockInfo { } impl SkipReader { - pub fn new(data: OwnedBytes, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { + pub fn new(data: FileSlice, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { let mut skip_reader = SkipReader { last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { 0 @@ -123,7 +125,7 @@ impl SkipReader { skip_reader } - pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) { + pub fn reset(&mut self, data: FileSlice, doc_freq: u32) { self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as 
u32 { 0 } else { @@ -169,7 +171,7 @@ impl SkipReader { } fn read_block_info(&mut self) { - let bytes = self.owned_read.as_slice(); + let bytes = &self.owned_read.slice_to(std::cmp::min(11, self.owned_read.len())).to_vec(); let advance_len: usize; self.last_doc_in_block = read_u32(bytes); let doc_num_bits = bytes[4]; @@ -212,7 +214,7 @@ impl SkipReader { }; } } - self.owned_read.advance(advance_len); + self.owned_read = self.owned_read.slice_from(advance_len); } pub fn block_info(&self) -> BlockInfo { diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index fa33eef4f2..97f6303267 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -1,6 +1,6 @@ -use crate::common::compute_num_bits; +use crate::{HasLen, common::compute_num_bits}; use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize}; -use crate::directory::{FileSlice, OwnedBytes}; +use crate::directory::{FileSlice, FakeArr}; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; use byteorder::{ByteOrder, LittleEndian}; @@ -58,7 +58,7 @@ impl TermInfoBlockMeta { // Here inner_offset is the offset within the block, WITHOUT the first term_info. // In other word, term_info #1,#2,#3 gets inner_offset 0,1,2... While term_info #0 // is encoded without bitpacking. - fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo { + fn deserialize_term_info(&self, data: &dyn FakeArr, inner_offset: usize) -> TermInfo { assert!(inner_offset < BLOCK_LEN - 1); let num_bits = self.num_bits() as usize; @@ -88,22 +88,22 @@ impl TermInfoBlockMeta { #[derive(Debug)] pub struct TermInfoStore { num_terms: usize, - block_meta_bytes: OwnedBytes, - term_info_bytes: OwnedBytes, + block_meta_bytes: FileSlice, + term_info_bytes: FileSlice, } -fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { +fn extract_bits(data: &dyn FakeArr, addr_bits: usize, num_bits: u8) -> u64 { assert!(num_bits <= 56); let addr_byte = addr_bits / 8; let bit_shift = (addr_bits % 8) as u64; let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 { - LittleEndian::read_u64(&data[addr_byte..][..8]) + LittleEndian::read_u64(&data.slice((addr_byte..addr_byte + 8).into()).to_vec()) } else { // the buffer is not large enough. // Let's copy the few remaining bytes to a 8 byte buffer // padded with 0s. let mut buf = [0u8; 8]; - let data_to_copy = &data[addr_byte..]; + let data_to_copy = &data.slice((addr_byte..).into()).to_vec(); let nbytes = data_to_copy.len(); buf[..nbytes].copy_from_slice(data_to_copy); LittleEndian::read_u64(&buf) @@ -120,27 +120,26 @@ impl TermInfoStore { let len = u64::deserialize(&mut bytes)? as usize; let num_terms = u64::deserialize(&mut bytes)? 
as usize; let (block_meta_file, term_info_file) = main_slice.split(len); - let term_info_bytes = term_info_file.read_bytes()?; Ok(TermInfoStore { num_terms, - block_meta_bytes: block_meta_file.read_bytes()?, - term_info_bytes, + block_meta_bytes: block_meta_file, + term_info_bytes: term_info_file, }) } pub fn get(&self, term_ord: TermOrdinal) -> TermInfo { let block_id = (term_ord as usize) / BLOCK_LEN; - let buffer = self.block_meta_bytes.as_slice(); - let mut block_data: &[u8] = &buffer[block_id * TermInfoBlockMeta::SIZE_IN_BYTES..]; + let block_data = self.block_meta_bytes.slice(block_id * TermInfoBlockMeta::SIZE_IN_BYTES, HasLen::len(&self.block_meta_bytes)); + let mut block_data = block_data.full_slice(); let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data) .expect("Failed to deserialize terminfoblockmeta"); let inner_offset = (term_ord as usize) % BLOCK_LEN; if inner_offset == 0 { return term_info_block_data.ref_term_info; } - let term_info_data = self.term_info_bytes.as_slice(); + let term_info_data = self.term_info_bytes.slice(term_info_block_data.offset as usize, HasLen::len(&self.term_info_bytes)); term_info_block_data.deserialize_term_info( - &term_info_data[term_info_block_data.offset as usize..], + &term_info_data, inner_offset - 1, ) } @@ -304,9 +303,9 @@ mod tests { assert_eq!(compute_num_bits(51), 6); bitpack.close(&mut buffer).unwrap(); assert_eq!(buffer.len(), 3 + 7); - assert_eq!(extract_bits(&buffer[..], 0, 9), 321u64); - assert_eq!(extract_bits(&buffer[..], 9, 2), 2u64); - assert_eq!(extract_bits(&buffer[..], 11, 6), 51u64); + assert_eq!(extract_bits(buffer, 0, 9), 321u64); + assert_eq!(extract_bits(buffer, 9, 2), 2u64); + assert_eq!(extract_bits(buffer, 11, 6), 51u64); } #[test] diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs index 9be4543c16..50a4f2dd5c 100644 --- a/src/termdict/fst_termdict/termdict.rs +++ b/src/termdict/fst_termdict/termdict.rs @@ -1,14 +1,13 @@ use super::term_info_store::{TermInfoStore, TermInfoStoreWriter}; use super::{TermStreamer, TermStreamerBuilder}; -use crate::{common::{BinarySerializable, CountingWriter}, directory::OnDemandBox}; +use crate::{common::{BinarySerializable, CountingWriter}}; use crate::directory::{FileSlice, OwnedBytes}; use crate::error::DataCorruption; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; -use crate::directory::OnDemandBytes; use once_cell::sync::Lazy; use std::io::{self, Write}; -use tantivy_fst::{FakeArr, raw::Fst}; +use tantivy_fst::{raw::Fst}; use tantivy_fst::Automaton; fn convert_fst_error(e: tantivy_fst::Error) -> io::Error { From 397689a88e1e52a2f158f0dfe8e3169e9c6cceb0 Mon Sep 17 00:00:00 2001 From: phiresky Date: Thu, 20 May 2021 16:00:11 +0200 Subject: [PATCH 4/9] minor fix. 
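
Worth pausing on `FSFile::read_bytes` from the previous patch: it maps an arbitrary byte range onto fixed 4096-byte cache chunks, and that index arithmetic is what this "minor fix" keeps adjusting. The same mapping as a standalone sketch (hypothetical helper; in the patch, the chunk fetch goes through the `BTreeMap` cache via `cache.entry(i).or_insert_with(...)` and `read_bytes_real`):

    const CS: usize = 4096; // chunk size, as in fs_directory.rs

    fn read_range(from: usize, to: usize, mut fetch: impl FnMut(usize) -> Vec<u8>) -> Vec<u8> {
        let (first, last) = (from / CS, to / CS);
        let mut out = Vec::with_capacity(to - from);
        for i in first..=last {
            let chunk = fetch(i); // chunk i covers bytes [i * CS, (i + 1) * CS)
            let lo = if i == first { from % CS } else { 0 };
            let hi = if i == last { to % CS } else { CS };
            out.extend_from_slice(&chunk[lo..hi.min(chunk.len())]);
        }
        out
    }

Like the patch, this still visits chunk `to / CS` when `to` is chunk-aligned (the copy is then empty); the `min` only matters for a short final chunk at end of file.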
--- src/directory/fs_directory.rs | 10 ++++------ src/postings/block_segment_postings.rs | 5 ----- src/postings/skip.rs | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/directory/fs_directory.rs b/src/directory/fs_directory.rs index e409fdd5bb..943454f6e0 100644 --- a/src/directory/fs_directory.rs +++ b/src/directory/fs_directory.rs @@ -102,10 +102,9 @@ impl FSFile { let len = to - from; eprintln!( - "READ {} @ {}, len {}", + "READ {} chunk {}", self.path.to_string_lossy(), - from, - len + from / CS ); if len == 51616 { println!("{:?}", backtrace::Backtrace::new()); @@ -127,12 +126,12 @@ impl FSFile { impl FileHandle for FSFile { fn read_bytes(&self, from: usize, to: usize) -> std::io::Result { let len = to - from; - eprintln!( + /*eprintln!( "GET {} @ {}, len {}", self.path.to_string_lossy(), from, len - ); + );*/ let starti = from / CS; let endi = to / CS; let startofs = from % CS; @@ -148,7 +147,6 @@ impl FileHandle for FSFile { self.read_bytes_real(i * CS, std::cmp::min((i + 1) * CS, self.len())) }); let chunk = &chunk[startofs..endofs]; - println!("{} {} {} {}", out_buf.len(), startofs, endofs, chunk.len()); let write_len = std::cmp::min(chunk.len(), len); out_buf[written..written + write_len] .copy_from_slice(&chunk); diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 41b56b41b7..6633d73611 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -54,14 +54,9 @@ fn decode_bitpacked_block( let num_bytes_freqs = freq_decoder_opt.as_ref() .map(|_| 128 * (tf_num_bits as usize) / 8) .unwrap_or(0); - let num_toconsume_bytes = num_bytes_docs + num_bytes_freqs; let data = data.slice((0..num_bytes_docs + num_bytes_freqs).into()).to_vec(); let num_consumed_bytes = doc_decoder.uncompress_block_sorted(&data, doc_offset, doc_num_bits); assert_eq!(num_bytes_docs, num_consumed_bytes); - println!( - "ofs={}, bits={}, consumed={}", - doc_offset, doc_num_bits, num_consumed_bytes - ); if let Some(freq_decoder) = freq_decoder_opt { freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits); } diff --git a/src/postings/skip.rs b/src/postings/skip.rs index ba4c7d2bf1..f2e9c8bab9 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -171,7 +171,7 @@ impl SkipReader { } fn read_block_info(&mut self) { - let bytes = &self.owned_read.slice_to(std::cmp::min(11, self.owned_read.len())).to_vec(); + let bytes = &self.owned_read.slice_to(std::cmp::min(12, self.owned_read.len())).to_vec(); let advance_len: usize; self.last_doc_in_block = read_u32(bytes); let doc_num_bits = bytes[4]; From 76be25bee8ec1f74363845fd87fa4624bea5b221 Mon Sep 17 00:00:00 2001 From: phiresky Date: Thu, 20 May 2021 17:18:45 +0200 Subject: [PATCH 5/9] Ulen part 1 --- src/common/mod.rs | 4 +- src/common/serialize.rs | 3 +- src/directory/file_slice.rs | 37 ++++++++-------- src/directory/footer.rs | 5 ++- src/directory/fs_directory.rs | 46 ++++++++------------ src/directory/owned_bytes.rs | 5 ++- src/directory/ram_directory.rs | 2 +- src/postings/compression/mod.rs | 2 +- src/termdict/fst_termdict/term_info_store.rs | 3 +- 9 files changed, 51 insertions(+), 56 deletions(-) diff --git a/src/common/mod.rs b/src/common/mod.rs index 45bee6a592..24224a742c 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -14,7 +14,7 @@ pub use self::vint::{ read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, }; pub use byteorder::LittleEndian as Endianness; - +use 
tantivy_fst::Ulen; /// Segment's max doc must be `< MAX_DOC_LIMIT`. /// /// We do not allow segments with more than @@ -69,7 +69,7 @@ pub(crate) fn compute_num_bits(n: u64) -> u8 { /// Has length trait pub trait HasLen { /// Return length - fn len(&self) -> usize; + fn len(&self) -> Ulen; /// Returns true iff empty. fn is_empty(&self) -> bool { diff --git a/src/common/serialize.rs b/src/common/serialize.rs index 6b89bbe703..98f1896947 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -1,6 +1,7 @@ use crate::common::Endianness; use crate::common::VInt; use byteorder::{ReadBytesExt, WriteBytesExt}; +use tantivy_fst::Ulen; use std::fmt; use std::io; use std::io::Read; @@ -17,7 +18,7 @@ pub trait BinarySerializable: fmt::Debug + Sized { /// `FixedSize` marks a `BinarySerializable` as /// always serializing to the same size. pub trait FixedSize: BinarySerializable { - const SIZE_IN_BYTES: usize; + const SIZE_IN_BYTES: Ulen; } impl BinarySerializable for () { diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs index b460cd0a04..3af5c2cea1 100644 --- a/src/directory/file_slice.rs +++ b/src/directory/file_slice.rs @@ -1,5 +1,6 @@ use stable_deref_trait::StableDeref; pub use tantivy_fst::FakeArr; +use tantivy_fst::Ulen; use crate::common::HasLen; use crate::directory::OwnedBytes; @@ -22,15 +23,15 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + Debug { /// Reads a slice of bytes. /// /// This method may panic if the range requested is invalid. - fn read_bytes(&self, from: usize, to: usize) -> io::Result; + fn read_bytes(&self, from: Ulen, to: Ulen) -> io::Result; } impl FakeArr for FileSlice { - fn len(&self) -> usize { + fn len(&self) -> Ulen { self.stop - self.start } - fn read_into(&self, offset: usize, buf: &mut [u8]) -> io::Result<()> { + fn read_into(&self, offset: Ulen, buf: &mut [u8]) -> io::Result<()> { buf.copy_from_slice(&self.read_bytes_slice(offset, offset + buf.len())?); Ok(()) } @@ -41,15 +42,15 @@ impl FakeArr for FileSlice { } impl FileHandle for &'static [u8] { - fn read_bytes(&self, from: usize, to: usize) -> io::Result { - let bytes = &self[from..to]; + fn read_bytes(&self, from: Ulen, to: Ulen) -> io::Result { + let bytes = &self[from as usize..to as usize]; Ok(OwnedBytes::new(bytes)) } } impl> HasLen for T { - fn len(&self) -> usize { - self.as_ref().len() + fn len(&self) -> Ulen { + self.as_ref().len() as Ulen } } @@ -69,8 +70,8 @@ where #[derive(Clone, Debug)] pub struct FileSlice { data: Arc, - start: usize, - stop: usize, + start: Ulen, + stop: Ulen, } impl FileSlice { @@ -82,7 +83,7 @@ impl FileSlice { /// Wraps a FileHandle. #[doc(hidden)] - pub fn new_with_num_bytes(file_handle: Box, num_bytes: usize) -> Self { + pub fn new_with_num_bytes(file_handle: Box, num_bytes: Ulen) -> Self { FileSlice { data: Arc::from(file_handle), start: 0, @@ -95,7 +96,7 @@ impl FileSlice { /// # Panics /// /// Panics if `to < from` or if `to` exceeds the filesize. - pub fn slice(&self, from: usize, to: usize) -> FileSlice { + pub fn slice(&self, from: Ulen, to: Ulen) -> FileSlice { assert!(to <= ::len(&self)); assert!(to >= from); FileSlice { @@ -124,7 +125,7 @@ impl FileSlice { /// Reads a specific slice of data. /// /// This is equivalent to running `file_slice.slice(from, to).read_bytes()`. 
- pub fn read_bytes_slice(&self, from: usize, to: usize) -> io::Result { + pub fn read_bytes_slice(&self, from: Ulen, to: Ulen) -> io::Result { assert!(from <= to); assert!( self.start + to <= self.stop, @@ -137,7 +138,7 @@ impl FileSlice { /// `file_slice[..split_offset]` and `file_slice[split_offset..]`. /// /// This operation is cheap and must not copy any underlying data. - pub fn split(self, left_len: usize) -> (FileSlice, FileSlice) { + pub fn split(self, left_len: Ulen) -> (FileSlice, FileSlice) { let left = self.slice_to(left_len); let right = self.slice_from(left_len); (left, right) @@ -145,7 +146,7 @@ impl FileSlice { /// Splits the file slice at the given offset and return two file slices. /// `file_slice[..split_offset]` and `file_slice[split_offset..]`. - pub fn split_from_end(self, right_len: usize) -> (FileSlice, FileSlice) { + pub fn split_from_end(self, right_len: Ulen) -> (FileSlice, FileSlice) { let left_len = HasLen::len(&self) - right_len; self.split(left_len) } @@ -154,7 +155,7 @@ impl FileSlice { /// boundary. /// /// Equivalent to `.slice(from_offset, self.len())` - pub fn slice_from(&self, from_offset: usize) -> FileSlice { + pub fn slice_from(&self, from_offset: Ulen) -> FileSlice { self.slice(from_offset, ::len(&self)) } @@ -162,19 +163,19 @@ impl FileSlice { /// boundary. /// /// Equivalent to `.slice(0, to_offset)` - pub fn slice_to(&self, to_offset: usize) -> FileSlice { + pub fn slice_to(&self, to_offset: Ulen) -> FileSlice { self.slice(0, to_offset) } } impl FileHandle for FileSlice { - fn read_bytes(&self, from: usize, to: usize) -> io::Result { + fn read_bytes(&self, from: Ulen, to: Ulen) -> io::Result { self.read_bytes_slice(from, to) } } impl HasLen for FileSlice { - fn len(&self) -> usize { + fn len(&self) -> Ulen { self.stop - self.start } } diff --git a/src/directory/footer.rs b/src/directory/footer.rs index b2f495f6cb..a0c0ed5b9d 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -4,6 +4,7 @@ use crate::directory::FileSlice; use crate::directory::{AntiCallToken, TerminatingWrite}; use crate::Version; use crc32fast::Hasher; +use tantivy_fst::Ulen; use std::io; use std::io::Write; @@ -77,9 +78,9 @@ impl Footer { ), )); } - let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES); + let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES as Ulen); let mut footer_len_bytes = footer_len_file.read_bytes()?; - let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize; + let footer_len = u32::deserialize(&mut footer_len_bytes)? 
as Ulen; let (body, footer) = body_footer.split_from_end(footer_len); let mut footer_bytes = footer.read_bytes()?; let footer = Footer::deserialize(&mut footer_bytes)?; diff --git a/src/directory/fs_directory.rs b/src/directory/fs_directory.rs index 943454f6e0..5a29d194cd 100644 --- a/src/directory/fs_directory.rs +++ b/src/directory/fs_directory.rs @@ -1,11 +1,6 @@ -use std::{ - collections::BTreeMap, - fs::File, - io::{BufWriter, Read, Seek, SeekFrom, Write}, - ops::DerefMut, - path::{Path, PathBuf}, - sync::{Arc, RwLock}, -}; +use std::{collections::BTreeMap, convert::TryInto, fs::File, io::{BufWriter, Read, Seek, SeekFrom, Write}, ops::DerefMut, path::{Path, PathBuf}, sync::{Arc, RwLock}}; + +use tantivy_fst::Ulen; use crate::{ directory::{error::OpenWriteError, FileHandle, OwnedBytes, TerminatingWrite, WatchHandle}, @@ -82,10 +77,10 @@ impl Directory for FsDirectory { struct FSFile { path: PathBuf, file: Arc>, - len: usize, - cache: RwLock>>, + len: Ulen, + cache: RwLock>>, } -const CS: usize = 4096; +const CS: Ulen = 4096; impl FSFile { pub fn new(path: &Path) -> FSFile { @@ -94,18 +89,14 @@ impl FSFile { FSFile { path: path.to_path_buf(), file: Arc::new(RwLock::new(f)), - len: len as usize, + len, cache: RwLock::new(BTreeMap::new()), } } - fn read_bytes_real(&self, from: usize, to: usize) -> Vec { + fn read_bytes_real(&self, from: Ulen, to: Ulen) -> Vec { let len = to - from; - eprintln!( - "READ {} chunk {}", - self.path.to_string_lossy(), - from / CS - ); + eprintln!("READ {} chunk {}", self.path.to_string_lossy(), from / CS); if len == 51616 { println!("{:?}", backtrace::Backtrace::new()); } @@ -117,15 +108,15 @@ impl FSFile { } let mut f = self.file.write().unwrap(); f.seek(SeekFrom::Start(from as u64)).unwrap(); - let mut buf = Vec::with_capacity(len); + let mut buf = Vec::with_capacity(len.try_into().unwrap()); let flonk = f.deref_mut(); (flonk).take(len as u64).read_to_end(&mut buf).unwrap(); return buf; } } impl FileHandle for FSFile { - fn read_bytes(&self, from: usize, to: usize) -> std::io::Result { - let len = to - from; + fn read_bytes(&self, from: Ulen, to: Ulen) -> std::io::Result { + let len: usize = (to - from).try_into().unwrap(); /*eprintln!( "GET {} @ {}, len {}", self.path.to_string_lossy(), @@ -134,22 +125,21 @@ impl FileHandle for FSFile { );*/ let starti = from / CS; let endi = to / CS; - let startofs = from % CS; - let endofs = to % CS; + let startofs = (from % CS) as usize; + let endofs = (to % CS) as usize; let mut out_buf = vec![0u8; len]; //let toget = vec![]; let mut cache = self.cache.write().unwrap(); let mut written = 0; for i in starti..=endi { let startofs = if i == starti { startofs } else { 0 }; - let endofs = if i == endi { endofs } else { CS }; + let endofs = if i == endi { endofs } else { CS as usize }; let chunk = cache.entry(i).or_insert_with(|| { self.read_bytes_real(i * CS, std::cmp::min((i + 1) * CS, self.len())) }); let chunk = &chunk[startofs..endofs]; - let write_len = std::cmp::min(chunk.len(), len); - out_buf[written..written + write_len] - .copy_from_slice(&chunk); + let write_len = std::cmp::min(chunk.len(), len as usize); + out_buf[written..written + write_len].copy_from_slice(&chunk); written += write_len; } @@ -157,7 +147,7 @@ impl FileHandle for FSFile { } } impl HasLen for FSFile { - fn len(&self) -> usize { + fn len(&self) -> Ulen { self.len } } diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs index 93cefced10..6c2cce2267 100644 --- a/src/directory/owned_bytes.rs +++ b/src/directory/owned_bytes.rs @@ 
-1,5 +1,6 @@ use crate::directory::FileHandle; use stable_deref_trait::StableDeref; +use tantivy_fst::Ulen; //use tantivy_fst::FakeArr; use std::convert::TryInto; use std::mem; @@ -18,8 +19,8 @@ pub struct OwnedBytes { } impl FileHandle for OwnedBytes { - fn read_bytes(&self, from: usize, to: usize) -> io::Result { - Ok(self.slice(from, to)) + fn read_bytes(&self, from: Ulen, to: Ulen) -> io::Result { + Ok(self.slice(from as usize, to as usize)) } } diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 589247e451..21bd578256 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -115,7 +115,7 @@ impl InnerDirectory { } fn total_mem_usage(&self) -> usize { - self.fs.values().map(|f| f.len()).sum() + self.fs.values().map(|f| f.len() as usize).sum() } } diff --git a/src/postings/compression/mod.rs b/src/postings/compression/mod.rs index 29da6de074..671a260bb3 100644 --- a/src/postings/compression/mod.rs +++ b/src/postings/compression/mod.rs @@ -3,7 +3,7 @@ use bitpacking::{BitPacker, BitPacker4x}; use tantivy_fst::FakeArr; pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; -const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES; +const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES as usize; mod vint; diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index 97f6303267..93a5e91086 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -4,6 +4,7 @@ use crate::directory::{FileSlice, FakeArr}; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; use byteorder::{ByteOrder, LittleEndian}; +use tantivy_fst::Ulen; use std::cmp; use std::io::{self, Read, Write}; @@ -92,7 +93,7 @@ pub struct TermInfoStore { term_info_bytes: FileSlice, } -fn extract_bits(data: &dyn FakeArr, addr_bits: usize, num_bits: u8) -> u64 { +fn extract_bits(data: &dyn FakeArr, addr_bits: Ulen, num_bits: u8) -> u64 { assert!(num_bits <= 56); let addr_byte = addr_bits / 8; let bit_shift = (addr_bits % 8) as u64; From 6a0cf7eede4223dd444024b5e68ef9e3c8c1be16 Mon Sep 17 00:00:00 2001 From: phiresky Date: Sat, 22 May 2021 10:46:31 +0200 Subject: [PATCH 6/9] WORKS! 
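
The `Ulen` migration in patches 5 and 6 changes offset types, not the bit-unpacking math. For reference, the `extract_bits` function from term_info_store.rs restated over a plain byte slice (a hypothetical standalone sketch, not part of the patch):

    fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
        // 56-bit cap: after an up-to-7-bit shift the value still fits in a u64.
        assert!(num_bits <= 56);
        let addr_byte = addr_bits / 8;
        let bit_shift = (addr_bits % 8) as u64;
        // Read 8 bytes little-endian, zero-padded near the end of the buffer.
        let mut buf = [0u8; 8];
        let n = (data.len() - addr_byte).min(8);
        buf[..n].copy_from_slice(&data[addr_byte..addr_byte + n]);
        let mask = (1u64 << num_bits) - 1;
        (u64::from_le_bytes(buf) >> bit_shift) & mask
    }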
--- src/collector/facet_collector.rs | 2 +- src/common/bitpacker.rs | 3 ++ src/common/bitset.rs | 2 + src/common/composite_file.rs | 20 ++++---- src/common/serialize.rs | 16 +++--- src/common/vint.rs | 1 + src/core/executor.rs | 7 ++- src/core/index.rs | 2 + src/core/inverted_index_reader.rs | 10 ++-- src/core/searcher.rs | 2 + src/directory/file_slice.rs | 7 ++- src/directory/mmap_directory.rs | 5 +- src/directory/owned_bytes.rs | 18 ++++--- src/directory/ram_directory.rs | 7 +-- src/directory/watch_event_router.rs | 5 +- src/docset.rs | 6 ++- src/fastfield/bytes/reader.rs | 12 +++-- src/fastfield/delete.rs | 10 ++-- src/fastfield/facet_reader.rs | 4 +- src/fastfield/mod.rs | 15 +++--- src/fastfield/multivalued/reader.rs | 10 ++-- src/fastfield/multivalued/writer.rs | 5 +- src/fastfield/readers.rs | 4 +- src/fastfield/serializer.rs | 6 ++- src/fastfield/writer.rs | 3 +- src/fieldnorm/code.rs | 2 + src/fieldnorm/reader.rs | 8 +-- src/fieldnorm/writer.rs | 2 + src/functional_test.rs | 3 +- src/indexer/delete_queue.rs | 6 ++- src/indexer/log_merge_policy.rs | 2 + src/indexer/merger.rs | 5 +- src/indexer/segment_updater.rs | 1 + src/lib.rs | 1 + src/positions/mod.rs | 3 +- src/positions/reader.rs | 51 ++++++++++---------- src/positions/serializer.rs | 1 + src/postings/block_search.rs | 12 +++-- src/postings/block_segment_postings.rs | 18 +++---- src/postings/compression/vint.rs | 10 ++-- src/postings/segment_postings.rs | 10 ++-- src/postings/serializer.rs | 10 ++-- src/postings/skip.rs | 20 ++++---- src/postings/term_info.rs | 4 +- src/query/vec_docset.rs | 6 ++- src/reader/mod.rs | 2 + src/reader/pool.rs | 6 ++- src/schema/document.rs | 4 +- src/schema/schema.rs | 1 + src/schema/term.rs | 2 + src/space_usage/mod.rs | 3 +- src/store/index/block.rs | 4 +- src/store/index/mod.rs | 3 ++ src/store/index/skip_index.rs | 2 + src/store/index/skip_index_builder.rs | 2 + src/store/mod.rs | 4 +- src/store/reader.rs | 19 ++++---- src/store/writer.rs | 6 ++- src/termdict/fst_termdict/term_info_store.rs | 34 ++++++------- src/termdict/fst_termdict/termdict.rs | 6 +-- src/termdict/merger.rs | 2 + src/termdict/tests.rs | 2 + 62 files changed, 283 insertions(+), 176 deletions(-) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index cd10cbaf06..81b854d655 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -269,7 +269,7 @@ impl Collector for FacetCollector { let mut collapse_mapping = Vec::new(); let mut counts = Vec::new(); - let mut collapse_facet_ords = Vec::new(); + let mut collapse_facet_ords: Vec = Vec::new(); let mut collapse_facet_it = self.facets.iter().peekable(); collapse_facet_ords.push(0); diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 640d8adcff..881beb484a 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -1,4 +1,5 @@ use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; +use tantivy_fst::Ulen; use std::io; use crate::directory::OwnedBytes; @@ -103,6 +104,8 @@ impl BitUnpacker { #[cfg(test)] mod test { + use tantivy_fst::Ulen; + use super::{BitPacker, BitUnpacker}; use crate::directory::OwnedBytes; diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 0a8d6f4de7..a09d8e2c90 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -1,6 +1,8 @@ use std::fmt; use std::u64; +use tantivy_fst::Ulen; + #[derive(Clone, Copy, Eq, PartialEq)] pub(crate) struct TinySet(u64); diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 5982743153..af184b3414 
100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::BinarySerializable; use crate::common::CountingWriter; use crate::common::VInt; @@ -14,11 +16,11 @@ use super::HasLen; #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] pub struct FileAddr { field: Field, - idx: usize, + idx: Ulen, } impl FileAddr { - fn new(field: Field, idx: usize) -> FileAddr { + fn new(field: Field, idx: Ulen) -> FileAddr { FileAddr { field, idx } } } @@ -32,7 +34,7 @@ impl BinarySerializable for FileAddr { fn deserialize(reader: &mut R) -> io::Result { let field = Field::deserialize(reader)?; - let idx = VInt::deserialize(reader)?.0 as usize; + let idx = VInt::deserialize(reader)?.0 as Ulen; Ok(FileAddr { field, idx }) } } @@ -59,7 +61,7 @@ impl CompositeWrite { } /// Start writing a new field. - pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter { + pub fn for_field_with_idx(&mut self, field: Field, idx: Ulen) -> &mut CountingWriter { let offset = self.write.written_bytes(); let file_addr = FileAddr::new(field, idx); assert!(!self.offsets.contains_key(&file_addr)); @@ -105,7 +107,7 @@ impl CompositeWrite { #[derive(Clone)] pub struct CompositeFile { data: FileSlice, - offsets_index: HashMap, + offsets_index: HashMap, } impl CompositeFile { @@ -114,7 +116,7 @@ impl CompositeFile { pub fn open(data: &FileSlice) -> io::Result { let end = data.len(); let footer_len_data = data.slice_from(end - 4).read_bytes()?; - let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize; + let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as Ulen; let footer_start = end - 4 - footer_len; let footer_data = data .slice(footer_start, footer_start + footer_len) @@ -128,7 +130,7 @@ impl CompositeFile { let mut offset = 0; for _ in 0..num_fields { - offset += VInt::deserialize(&mut footer_buffer)?.0 as usize; + offset += VInt::deserialize(&mut footer_buffer)?.0 as Ulen; let file_addr = FileAddr::deserialize(&mut footer_buffer)?; offsets.push(offset); file_addrs.push(file_addr); @@ -164,7 +166,7 @@ impl CompositeFile { /// Returns the `FileSlice` associated /// to a given `Field` and stored in a `CompositeFile`. 
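
// CompositeFile::open() above locates its footer by reading a trailing u32
// length marker, then walks VInt-encoded offset deltas, one per stored field.
// The bounds arithmetic alone, as a standalone sketch (helper name is
// illustrative, not part of the patch):
fn composite_footer_bounds(file_len: u64, footer_len: u64) -> (u64, u64) {
    // the u32 length marker occupies the last 4 bytes; the footer sits
    // immediately before it, i.e. [footer_start, file_len - 4)
    let footer_start = file_len - 4 - footer_len;
    (footer_start, file_len - 4)
}
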
- pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option { + pub fn open_read_with_idx(&self, field: Field, idx: Ulen) -> Option { self.offsets_index .get(&FileAddr { field, idx }) .map(|&(from, to)| self.data.slice(from, to)) @@ -176,7 +178,7 @@ impl CompositeFile { fields .entry(field_addr.field) .or_insert_with(|| FieldUsage::empty(field_addr.field)) - .add_field_idx(field_addr.idx, end - start); + .add_field_idx(field_addr.idx as usize, end - start); } PerFieldSpaceUsage::new(fields) } diff --git a/src/common/serialize.rs b/src/common/serialize.rs index 98f1896947..86168e5403 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -31,7 +31,7 @@ impl BinarySerializable for () { } impl FixedSize for () { - const SIZE_IN_BYTES: usize = 0; + const SIZE_IN_BYTES: Ulen = 0; } impl BinarySerializable for Vec { @@ -74,7 +74,7 @@ impl BinarySerializable for u32 { } impl FixedSize for u32 { - const SIZE_IN_BYTES: usize = 4; + const SIZE_IN_BYTES: Ulen = 4; } impl BinarySerializable for u64 { @@ -87,7 +87,7 @@ impl BinarySerializable for u64 { } impl FixedSize for u64 { - const SIZE_IN_BYTES: usize = 8; + const SIZE_IN_BYTES: Ulen = 8; } impl BinarySerializable for f32 { @@ -100,7 +100,7 @@ impl BinarySerializable for f32 { } impl FixedSize for f32 { - const SIZE_IN_BYTES: usize = 4; + const SIZE_IN_BYTES: Ulen = 4; } impl BinarySerializable for i64 { @@ -113,7 +113,7 @@ impl BinarySerializable for i64 { } impl FixedSize for i64 { - const SIZE_IN_BYTES: usize = 8; + const SIZE_IN_BYTES: Ulen = 8; } impl BinarySerializable for f64 { @@ -126,7 +126,7 @@ impl BinarySerializable for f64 { } impl FixedSize for f64 { - const SIZE_IN_BYTES: usize = 8; + const SIZE_IN_BYTES: Ulen = 8; } impl BinarySerializable for u8 { @@ -139,7 +139,7 @@ impl BinarySerializable for u8 { } impl FixedSize for u8 { - const SIZE_IN_BYTES: usize = 1; + const SIZE_IN_BYTES: Ulen = 1; } impl BinarySerializable for String { @@ -168,7 +168,7 @@ pub mod test { pub fn fixed_size_test() { let mut buffer = Vec::new(); O::default().serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), O::SIZE_IN_BYTES); + assert_eq!(buffer.len(), O::SIZE_IN_BYTES as usize); } fn serialize_test(v: T) -> usize { diff --git a/src/common/vint.rs b/src/common/vint.rs index 6bbce4641d..f295b6276e 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -1,5 +1,6 @@ use super::BinarySerializable; use byteorder::{ByteOrder, LittleEndian}; +use tantivy_fst::Ulen; use std::io; use std::io::Read; use std::io::Write; diff --git a/src/core/executor.rs b/src/core/executor.rs index 8ac39a7eb5..d0459026a1 100644 --- a/src/core/executor.rs +++ b/src/core/executor.rs @@ -1,5 +1,6 @@ use crossbeam::channel; use rayon::{ThreadPool, ThreadPoolBuilder}; +use tantivy_fst::Ulen; /// Search executor whether search request are single thread or multithread. 
/// @@ -87,12 +88,14 @@ impl Executor { #[cfg(test)] mod tests { + use tantivy_fst::Ulen; + use super::Executor; #[test] #[should_panic(expected = "panic should propagate")] fn test_panic_propagates_single_thread() { - let _result: Vec = Executor::single_thread() + let _result: Vec = Executor::single_thread() .map( |_| { panic!("panic should propagate"); @@ -105,7 +108,7 @@ mod tests { #[test] #[should_panic] //< unfortunately the panic message is not propagated fn test_panic_propagates_multi_thread() { - let _result: Vec = Executor::multi_thread(1, "search-test") + let _result: Vec = Executor::multi_thread(1, "search-test") .unwrap() .map( |_| { diff --git a/src/core/index.rs b/src/core/index.rs index 32f064ac1f..2ab65affc9 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::segment::Segment; use crate::core::Executor; use crate::core::IndexMeta; diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 9c4a1d049c..23349bec39 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,5 +1,7 @@ use std::io; +use tantivy_fst::Ulen; + use crate::common::BinarySerializable; use crate::directory::FileSlice; use crate::positions::PositionReader; @@ -90,8 +92,8 @@ impl InvertedIndexReader { term_info: &TermInfo, block_postings: &mut BlockSegmentPostings, ) -> io::Result<()> { - let start_offset = term_info.postings_start_offset as usize; - let stop_offset = term_info.postings_stop_offset as usize; + let start_offset = term_info.postings_start_offset as Ulen; + let stop_offset = term_info.postings_stop_offset as Ulen; let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset); block_postings.reset(term_info.doc_freq, postings_slice); Ok(()) @@ -121,8 +123,8 @@ impl InvertedIndexReader { requested_option: IndexRecordOption, ) -> io::Result { let postings_data = self.postings_file_slice.slice( - term_info.postings_start_offset as usize, - term_info.postings_stop_offset as usize, + term_info.postings_start_offset as Ulen, + term_info.postings_stop_offset as Ulen, ); BlockSegmentPostings::open( term_info.doc_freq, diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 7123cfcf4a..b181762e61 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::collector::Collector; use crate::core::Executor; diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs index 3af5c2cea1..a676818b52 100644 --- a/src/directory/file_slice.rs +++ b/src/directory/file_slice.rs @@ -50,7 +50,7 @@ impl FileHandle for &'static [u8] { impl> HasLen for T { fn len(&self) -> Ulen { - self.as_ref().len() as Ulen + (self.as_ref() as &[u8]).len() as Ulen } } @@ -159,6 +159,11 @@ impl FileSlice { self.slice(from_offset, ::len(&self)) } + /// like slice_from but inplace + pub fn advance(&mut self, from_offset: Ulen) { + self.start += from_offset; + } + /// Like `.slice(...)` but enforcing only the `to` /// boundary. 
/// diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 184795d9ea..3bdb371502 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -14,6 +14,7 @@ use fs2::FileExt; use memmap::Mmap; use serde::{Deserialize, Serialize}; use stable_deref_trait::StableDeref; +use tantivy_fst::Ulen; use std::convert::From; use std::fmt; use std::fs::OpenOptions; @@ -62,10 +63,10 @@ fn open_mmap(full_path: &Path) -> result::Result, OpenReadError> { #[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct CacheCounters { // Number of time the cache prevents to call `mmap` - pub hit: usize, + pub hit: Ulen, // Number of time tantivy had to call `mmap` // as no entry was in the cache. - pub miss: usize, + pub miss: Ulen, } #[derive(Clone, Debug, Serialize, Deserialize)] diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs index 6c2cce2267..0f739d08eb 100644 --- a/src/directory/owned_bytes.rs +++ b/src/directory/owned_bytes.rs @@ -8,6 +8,8 @@ use std::ops::Deref; use std::sync::Arc; use std::{fmt, io}; +use super::FileSlice; + /// An OwnedBytes simply wraps an object that owns a slice of data and exposes /// this data as a static slice. /// @@ -26,11 +28,11 @@ impl FileHandle for OwnedBytes { /*impl FakeArr for OwnedBytes { - fn len(&self) -> usize { + fn len(&self) -> Ulen { self.data.len() } - fn read_into(&self, offset: usize, buf: &mut [u8]) -> std::io::Result<()> { + fn read_into(&self, offset: Ulen, buf: &mut [u8]) -> std::io::Result<()> { let bytes = self.read_bytes(offset, offset + buf.len())?; buf.copy_from_slice(&bytes[..]); Ok(()) @@ -76,10 +78,14 @@ impl OwnedBytes { self.data } + pub fn as_file_slice(self) -> FileSlice { + FileSlice::new(Box::new(self)) + } + /// Returns the len of the slice. #[inline(always)] - pub fn len(&self) -> usize { - self.data.len() + pub fn len(&self) -> Ulen { + self.data.len() as Ulen } /// Splits the OwnedBytes into two OwnedBytes `(left, right)`. @@ -111,10 +117,10 @@ impl OwnedBytes { /// Drops the left most `advance_len` bytes. /// - /// See also [.clip(clip_len: usize))](#method.clip). + /// See also [.clip(clip_len: Ulen))](#method.clip). #[inline(always)] pub fn advance(&mut self, advance_len: usize) { - self.data = &self.data[advance_len..] + self.data = &self.data[advance_len as usize..] } /// Reads an `u8` from the `OwnedBytes` and advance by one byte. diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 21bd578256..5514ac0539 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -5,6 +5,7 @@ use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle}; use crate::directory::{TerminatingWrite, WritePtr}; use crate::{common::HasLen, core::META_FILEPATH}; use fail::fail_point; +use tantivy_fst::Ulen; use std::collections::HashMap; use std::fmt; use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write}; @@ -114,8 +115,8 @@ impl InnerDirectory { self.watch_router.subscribe(watch_handle) } - fn total_mem_usage(&self) -> usize { - self.fs.values().map(|f| f.len() as usize).sum() + fn total_mem_usage(&self) -> Ulen { + self.fs.values().map(|f| f.len() as Ulen).sum() } } @@ -137,7 +138,7 @@ impl RAMDirectory { /// Returns the sum of the size of the different files /// in the RAMDirectory. 
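
// The as_file_slice() helper added above lets eagerly-loaded bytes re-enter
// the lazy FileSlice world by boxing OwnedBytes behind the FileHandle trait
// (FileSlice::new(Box::new(self))). The shape of that adapter on plain types,
// as a sketch (Handle/Owned are illustrative stand-ins):
trait Handle {
    fn len(&self) -> u64;
}
struct Owned(Vec<u8>);
impl Handle for Owned {
    fn len(&self) -> u64 {
        self.0.len() as u64
    }
}
fn into_handle(bytes: Vec<u8>) -> Box<dyn Handle> {
    Box::new(Owned(bytes)) // mirrors FileSlice::new(Box::new(owned_bytes))
}
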
- pub fn total_mem_usage(&self) -> usize { + pub fn total_mem_usage(&self) -> Ulen { self.fs.read().unwrap().total_mem_usage() } diff --git a/src/directory/watch_event_router.rs b/src/directory/watch_event_router.rs index 72160dee95..a54be9161e 100644 --- a/src/directory/watch_event_router.rs +++ b/src/directory/watch_event_router.rs @@ -110,8 +110,9 @@ impl WatchCallbackList { mod tests { use crate::directory::{WatchCallback, WatchCallbackList}; use futures::executor::block_on; - use std::mem; - use std::sync::atomic::{AtomicUsize, Ordering}; + use tantivy_fst::Ulen; + use std::{mem, sync::atomic::AtomicUsize}; + use std::sync::atomic::{Ordering}; use std::sync::Arc; #[test] diff --git a/src/docset.rs b/src/docset.rs index 3c5dfdd315..6c5ca1bcf7 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::fastfield::DeleteBitSet; use crate::DocId; use std::borrow::Borrow; @@ -67,10 +69,10 @@ pub trait DocSet: Send { for (i, buffer_val) in buffer.iter_mut().enumerate() { *buffer_val = self.doc(); if self.advance() == TERMINATED { - return i + 1; + return i + 1 as usize; } } - buffer.len() + buffer.len() as usize } /// Returns the current document diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index 123d6a89be..f192b16cab 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::fastfield::FastFieldReader; @@ -28,20 +30,20 @@ impl BytesFastFieldReader { Ok(BytesFastFieldReader { idx_reader, values }) } - fn range(&self, doc: DocId) -> (usize, usize) { - let start = self.idx_reader.get(doc) as usize; - let stop = self.idx_reader.get(doc + 1) as usize; + fn range(&self, doc: DocId) -> (Ulen, Ulen) { + let start = self.idx_reader.get(doc) as Ulen; + let stop = self.idx_reader.get(doc + 1) as Ulen; (start, stop) } /// Returns the bytes associated to the given `doc` pub fn get_bytes(&self, doc: DocId) -> &[u8] { let (start, stop) = self.range(doc); - &self.values.as_slice()[start..stop] + &self.values.as_slice()[start as usize..stop as usize] } /// Returns the overall number of bytes in this bytes fast field. - pub fn total_num_bytes(&self) -> usize { + pub fn total_num_bytes(&self) -> Ulen { self.values.len() } } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 58d9b77b34..80c87fe760 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::{BitSet, HasLen}; use crate::directory::FileSlice; use crate::directory::OwnedBytes; @@ -41,7 +43,7 @@ pub fn write_delete_bitset( #[derive(Clone)] pub struct DeleteBitSet { data: OwnedBytes, - len: usize, + len: Ulen, } impl DeleteBitSet { @@ -66,10 +68,10 @@ impl DeleteBitSet { /// Opens a delete bitset given its file. 
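
// The DeleteBitSet::open() that follows counts deleted documents by
// popcounting every byte of the bitset once, up front. The same computation
// standalone:
fn count_deleted(bitset_bytes: &[u8]) -> u64 {
    bitset_bytes.iter().map(|b| u64::from(b.count_ones())).sum()
}
// e.g. count_deleted(&[0b0000_0101]) == 2
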
pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted: usize = bytes + let num_deleted: Ulen = bytes .as_slice() .iter() - .map(|b| b.count_ones() as usize) + .map(|b| b.count_ones() as Ulen) .sum(); Ok(DeleteBitSet { data: bytes, @@ -98,7 +100,7 @@ impl DeleteBitSet { } impl HasLen for DeleteBitSet { - fn len(&self) -> usize { + fn len(&self) -> Ulen { self.len } } diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index 6f802c153a..c8b95f7952 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::MultiValuedFastFieldReader; use crate::error::DataCorruption; use crate::schema::Facet; @@ -49,7 +51,7 @@ impl FacetReader { /// as deleted. /// /// `Facet` ordinals range from `0` to `num_facets() - 1`. - pub fn num_facets(&self) -> usize { + pub fn num_facets(&self) -> Ulen { self.term_dict.num_terms() } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index f7e9348d4b..1e12d0ee34 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -214,6 +214,7 @@ mod tests { use rand::prelude::SliceRandom; use rand::rngs::StdRng; use rand::SeedableRng; + use tantivy_fst::Ulen; use std::collections::HashMap; use std::path::Path; @@ -256,7 +257,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(&path).unwrap(); - assert_eq!(file.len(), 36 as usize); + assert_eq!(file.len(), 36 as Ulen); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(*FIELD).unwrap(); let fast_field_reader = FastFieldReader::::open(file)?; @@ -287,7 +288,7 @@ mod tests { serializer.close()?; } let file = directory.open_read(&path)?; - assert_eq!(file.len(), 61 as usize); + assert_eq!(file.len(), 61 as Ulen); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); @@ -323,7 +324,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(&path).unwrap(); - assert_eq!(file.len(), 34 as usize); + assert_eq!(file.len(), 34 as Ulen); { let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap(); @@ -355,7 +356,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(&path).unwrap(); - assert_eq!(file.len(), 80042 as usize); + assert_eq!(file.len(), 80042 as Ulen); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); @@ -394,7 +395,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(&path).unwrap(); - assert_eq!(file.len(), 17709 as usize); + assert_eq!(file.len(), 17709 as Ulen); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(i64_field).unwrap(); @@ -589,7 +590,7 @@ mod bench { let n = test::black_box(7000u32); let mut a = 0u64; for i in (0u32..n / 7).map(|v| v * 7) { - a ^= permutation[i as usize]; + a ^= permutation[i as Ulen]; } a }); @@ -602,7 +603,7 @@ mod bench { let n = test::black_box(1000u32); let mut a = 0u64; for _ in 0u32..n { - a = permutation[a as usize]; + a = permutation[a as Ulen]; } a }); diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index ac0d7775da..5b11309949 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::fastfield::{FastFieldReader, FastValue}; use 
crate::DocId; @@ -37,15 +39,15 @@ impl MultiValuedFastFieldReader { /// Returns the array of values associated to the given `doc`. pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { let (start, stop) = self.range(doc); - let len = (stop - start) as usize; - vals.resize(len, Item::make_zero()); + let len = (stop - start) as Ulen; + vals.resize(len as usize, Item::make_zero()); self.vals_reader.get_range_u64(start, &mut vals[..]); } /// Returns the number of values associated with the document `DocId`. - pub fn num_vals(&self, doc: DocId) -> usize { + pub fn num_vals(&self, doc: DocId) -> Ulen { let (start, stop) = self.range(doc); - (stop - start) as usize + (stop - start) as Ulen } /// Returns the overall number of values in this field . diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 9caf116ed9..b12d284fce 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -6,6 +6,7 @@ use crate::schema::{Document, Field}; use crate::termdict::TermOrdinal; use crate::DocId; use fnv::FnvHashMap; +use tantivy_fst::Ulen; use std::io; /// Writer for multi-valued (as in, more than one value per document) @@ -136,10 +137,10 @@ impl MultiValuedFastFieldWriter { .windows(2) .map(|interval| (interval[0], interval[1])) .chain(Some(last_interval).into_iter()) - .map(|(start, stop)| (start as usize, stop as usize)) + .map(|(start, stop)| (start as Ulen, stop as Ulen)) { doc_vals.clear(); - let remapped_vals = self.vals[start..stop] + let remapped_vals = self.vals[start as usize..stop as usize] .iter() .map(|val| *mapping.get(val).expect("Missing term ordinal")); doc_vals.extend(remapped_vals); diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 84d5a11fd2..2bec979383 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::CompositeFile; use crate::directory::FileSlice; use crate::fastfield::MultiValuedFastFieldReader; @@ -58,7 +60,7 @@ impl FastFieldReaders { self.fast_fields_composite.space_usage() } - fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result { + fn fast_field_data(&self, field: Field, idx: Ulen) -> crate::Result { self.fast_fields_composite .open_read_with_idx(field, idx) .ok_or_else(|| { diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 60f3c1b975..f82b849581 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::bitpacker::BitPacker; use crate::common::compute_num_bits; use crate::common::BinarySerializable; @@ -55,7 +57,7 @@ impl FastFieldSerializer { field: Field, min_value: u64, max_value: u64, - idx: usize, + idx: Ulen, ) -> io::Result>> { let field_write = self.composite_write.for_field_with_idx(field, idx); FastSingleFieldSerializer::open(field_write, min_value, max_value) @@ -65,7 +67,7 @@ impl FastFieldSerializer { pub fn new_bytes_fast_field_with_idx( &mut self, field: Field, - idx: usize, + idx: Ulen, ) -> io::Result>> { let field_write = self.composite_write.for_field_with_idx(field, idx); FastBytesFieldSerializer::open(field_write) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 79f9965ce2..3a46814dd3 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -7,6 +7,7 @@ use crate::postings::UnorderedTermId; use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema}; use crate::termdict::TermOrdinal; use fnv::FnvHashMap; +use 
tantivy_fst::Ulen; use std::collections::HashMap; use std::io; @@ -158,7 +159,7 @@ impl FastFieldsWriter { pub struct IntFastFieldWriter { field: Field, vals: Vec, - val_count: usize, + val_count: Ulen, val_if_missing: u64, val_min: u64, val_max: u64, diff --git a/src/fieldnorm/code.rs b/src/fieldnorm/code.rs index 4c4e78e733..fb066fc69a 100644 --- a/src/fieldnorm/code.rs +++ b/src/fieldnorm/code.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + #[inline(always)] pub fn id_to_fieldnorm(id: u8) -> u32 { FIELD_NORMS_TABLE[id as usize] diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index f1f8ef805b..ada4270b0e 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,4 +1,4 @@ -use tantivy_fst::FakeArr; +use tantivy_fst::{FakeArr, Ulen}; use super::{fieldnorm_to_id, id_to_fieldnorm}; use crate::{HasLen, common::CompositeFile}; @@ -126,7 +126,7 @@ impl FieldNormReader { pub fn fieldnorm(&self, doc_id: DocId) -> u32 { match &self.0 { ReaderImplEnum::FromData(data) => { - let fieldnorm_id = data.get_byte(doc_id as usize); + let fieldnorm_id = data.get_byte(doc_id as Ulen); id_to_fieldnorm(fieldnorm_id) } ReaderImplEnum::Const { fieldnorm, .. } => *fieldnorm, @@ -138,7 +138,7 @@ impl FieldNormReader { pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 { match &self.0 { ReaderImplEnum::FromData(data) => { - let fieldnorm_id = data.get_byte(doc_id as usize); + let fieldnorm_id = data.get_byte(doc_id as Ulen); fieldnorm_id } ReaderImplEnum::Const { fieldnorm_id, .. } => *fieldnorm_id, @@ -166,7 +166,7 @@ impl FieldNormReader { .map(FieldNormReader::fieldnorm_to_id) .collect::>(); let field_norms_data = OwnedBytes::new(field_norms_id); - FieldNormReader::new(field_norms_data) + FieldNormReader::new(field_norms_data.as_file_slice()) } } diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 061522e5c8..a66f0c4b8b 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::DocId; use super::fieldnorm_to_id; diff --git a/src/functional_test.rs b/src/functional_test.rs index 478a996861..1fbca2426e 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -3,6 +3,7 @@ use crate::Searcher; use crate::{doc, schema::*}; use rand::thread_rng; use rand::Rng; +use tantivy_fst::Ulen; use std::collections::HashSet; fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> { @@ -37,7 +38,7 @@ fn test_functional_store() -> crate::Result<()> { let mut doc_id = 0u64; for iteration in 0..500 { dbg!(iteration); - let num_docs: usize = rng.gen_range(0..4); + let num_docs: Ulen = rng.gen_range(0..4); if doc_set.len() >= 1 { let doc_to_remove_id = rng.gen_range(0..doc_set.len()); let removed_doc_id = doc_set.swap_remove(doc_to_remove_id); diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index ba445dd3be..fe22745c78 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::operation::DeleteOperation; use crate::Opstamp; use std::mem; @@ -247,6 +249,8 @@ impl DeleteCursor { #[cfg(test)] mod tests { + use tantivy_fst::Ulen; + use super::{DeleteOperation, DeleteQueue}; use crate::schema::{Field, Term}; @@ -254,7 +258,7 @@ mod tests { fn test_deletequeue() { let delete_queue = DeleteQueue::new(); - let make_op = |i: usize| { + let make_op = |i: Ulen| { let field = Field::from_field_id(1u32); DeleteOperation { opstamp: i as u64, diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 
455c7d4559..7318dd444f 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::merge_policy::{MergeCandidate, MergePolicy}; use crate::core::SegmentMeta; use std::cmp; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index d91f2f8e3f..bee5df0685 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::MAX_DOC_LIMIT; use crate::core::Segment; use crate::core::SegmentReader; @@ -27,7 +29,7 @@ use std::sync::Arc; fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { let mut total_tokens = 0u64; - let mut count: [usize; 256] = [0; 256]; + let mut count: [Ulen; 256] = [0; 256]; for reader in readers { if reader.has_deletes() { // if there are deletes, then we use an approximation @@ -725,6 +727,7 @@ mod tests { use byteorder::{BigEndian, ReadBytesExt}; use futures::executor::block_on; use schema::FAST; + use tantivy_fst::Ulen; #[test] fn test_index_merger_no_deletes() -> crate::Result<()> { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index d0cb240bc3..f312f30d76 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -23,6 +23,7 @@ use futures::channel::oneshot; use futures::executor::{ThreadPool, ThreadPoolBuilder}; use futures::future::Future; use futures::future::TryFutureExt; +use tantivy_fst::Ulen; use std::borrow::BorrowMut; use std::collections::HashSet; use std::io::Write; diff --git a/src/lib.rs b/src/lib.rs index 4f985785fc..6fd6038a23 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -295,6 +295,7 @@ mod tests { use rand::distributions::Uniform; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; + use tantivy_fst::Ulen; /// Checks if left and right are close one to each other. /// Panics if the two values are more than 0.5% apart. diff --git a/src/positions/mod.rs b/src/positions/mod.rs index d2bc278559..9052d5ef73 100644 --- a/src/positions/mod.rs +++ b/src/positions/mod.rs @@ -29,8 +29,9 @@ mod serializer; pub use self::reader::PositionReader; pub use self::serializer::PositionSerializer; use bitpacking::{BitPacker, BitPacker4x}; +use tantivy_fst::Ulen; -const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; +const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN as usize; const LONG_SKIP_IN_BLOCKS: usize = 1_024; const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64; diff --git a/src/positions/reader.rs b/src/positions/reader.rs index eded0c613a..d58573c37d 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -7,6 +7,7 @@ use crate::positions::COMPRESSION_BLOCK_SIZE; use crate::positions::LONG_SKIP_INTERVAL; use crate::positions::LONG_SKIP_IN_BLOCKS; use bitpacking::{BitPacker, BitPacker4x}; +use tantivy_fst::{FakeArr, Ulen}; /// Positions works as a long sequence of compressed block. /// All terms are chained one after the other. 
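
// The long-skip lookup below stores one u64 file offset per LONG_SKIP_IN_BLOCKS
// (1,024) position blocks, with entry 0 implicit (offset 0), so entry i-1 holds
// the byte offset of block i*1024. The index arithmetic, sketched standalone:
fn long_skip_entry_range(long_skip_id: u64) -> Option<(u64, u64)> {
    // byte range of the 8-byte entry inside the long-skip table, None for id 0
    if long_skip_id == 0 {
        return None;
    }
    let from = (long_skip_id - 1) * 8;
    Some((from, from + 8))
}
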
@@ -33,7 +34,7 @@ struct Positions { bit_packer: BitPacker4x, skip_file: FileSlice, position_file: FileSlice, - long_skip_data: OwnedBytes, + long_skip_data: FileSlice, } impl Positions { @@ -42,12 +43,11 @@ impl Positions { let footer_data = footer.read_bytes()?; let num_long_skips = u32::deserialize(&mut footer_data.as_slice())?; let (skip_file, long_skip_file) = - body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as usize)); - let long_skip_data = long_skip_file.read_bytes()?; + body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as Ulen)); Ok(Positions { bit_packer: BitPacker4x::new(), skip_file, - long_skip_data, + long_skip_data: long_skip_file, position_file, }) } @@ -55,26 +55,24 @@ impl Positions { /// Returns the offset of the block associated to the given `long_skip_id`. /// /// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks. - fn long_skip(&self, long_skip_id: usize) -> u64 { + fn long_skip(&self, long_skip_id: Ulen) -> u64 { if long_skip_id == 0 { return 0; } - let long_skip_slice = self.long_skip_data.as_slice(); - let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8]; + let from = (long_skip_id - 1) * 8; + let mut long_skip_blocks: &[u8] = &self.long_skip_data.slice(from, from + 8).to_vec(); u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") } fn reader(&self, offset: u64) -> io::Result { - let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize; + let long_skip_id = (offset / LONG_SKIP_INTERVAL) as Ulen; let offset_num_bytes: u64 = self.long_skip(long_skip_id); let position_read = self .position_file - .slice_from(offset_num_bytes as usize) - .read_bytes()?; + .slice_from(offset_num_bytes as Ulen); let skip_read = self .skip_file - .slice_from(long_skip_id * LONG_SKIP_IN_BLOCKS) - .read_bytes()?; + .slice_from(long_skip_id * LONG_SKIP_IN_BLOCKS as Ulen); Ok(PositionReader { bit_packer: self.bit_packer, skip_read, @@ -89,10 +87,10 @@ impl Positions { #[derive(Clone)] pub struct PositionReader { - skip_read: OwnedBytes, - position_read: OwnedBytes, + skip_read: FileSlice, + position_read: FileSlice, bit_packer: BitPacker4x, - buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>, + buffer: Box<[u32; COMPRESSION_BLOCK_SIZE as usize]>, block_offset: u64, anchor_offset: u64, @@ -110,15 +108,15 @@ impl PositionReader { positions.reader(offset) } - fn advance_num_blocks(&mut self, num_blocks: usize) { - let num_bits: usize = self.skip_read.as_ref()[..num_blocks] + fn advance_num_blocks(&mut self, num_blocks: Ulen) { + let num_bits: usize = self.skip_read.slice(0, num_blocks).to_vec() .iter() .cloned() .map(|num_bits| num_bits as usize) .sum(); let num_bytes_to_skip = num_bits * COMPRESSION_BLOCK_SIZE / 8; - self.skip_read.advance(num_blocks as usize); - self.position_read.advance(num_bytes_to_skip); + self.skip_read.advance(num_blocks as Ulen); + self.position_read.advance(num_bytes_to_skip as Ulen); } /// Fills a buffer with the positions `[offset..offset+output.len())` integers. @@ -137,22 +135,23 @@ impl PositionReader { // We need to decompress the first block. 
let delta_to_anchor_offset = offset - self.anchor_offset; let num_blocks_to_skip = - (delta_to_anchor_offset / (COMPRESSION_BLOCK_SIZE as u64)) as usize; + (delta_to_anchor_offset / (COMPRESSION_BLOCK_SIZE as u64)) as Ulen; self.advance_num_blocks(num_blocks_to_skip); self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64); self.block_offset = self.anchor_offset; - let num_bits = self.skip_read.as_slice()[0]; + let num_bits = self.skip_read.get_byte(0); self.bit_packer - .decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits); + .decompress(&self.position_read.to_vec(), self.buffer.as_mut(), num_bits); } else { let num_blocks_to_skip = - ((self.block_offset - self.anchor_offset) / COMPRESSION_BLOCK_SIZE as u64) as usize; + ((self.block_offset - self.anchor_offset) / COMPRESSION_BLOCK_SIZE as u64) as Ulen; self.advance_num_blocks(num_blocks_to_skip); self.anchor_offset = self.block_offset; } - let mut num_bits = self.skip_read.as_slice()[0]; - let mut position_data = self.position_read.as_ref(); + let mut num_bits = self.skip_read.get_byte(0); + let position_data = self.position_read.to_vec(); + let mut position_data = position_data.as_slice(); for i in 1.. { let offset_in_block = (offset as usize) % COMPRESSION_BLOCK_SIZE; @@ -165,7 +164,7 @@ impl PositionReader { output = &mut output[remaining_in_block..]; offset += remaining_in_block as u64; position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..]; - num_bits = self.skip_read.as_slice()[i]; + num_bits = self.skip_read.get_byte(i); self.bit_packer .decompress(position_data, self.buffer.as_mut(), num_bits); self.block_offset += COMPRESSION_BLOCK_SIZE as u64; diff --git a/src/positions/serializer.rs b/src/positions/serializer.rs index 72eb652b41..b86d7be5bb 100644 --- a/src/positions/serializer.rs +++ b/src/positions/serializer.rs @@ -3,6 +3,7 @@ use crate::common::CountingWriter; use crate::positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL}; use bitpacking::BitPacker; use bitpacking::BitPacker4x; +use tantivy_fst::Ulen; use std::io::{self, Write}; pub struct PositionSerializer { diff --git a/src/postings/block_search.rs b/src/postings/block_search.rs index 08cd553796..1b9e16147f 100644 --- a/src/postings/block_search.rs +++ b/src/postings/block_search.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::postings::compression::AlignedBuffer; /// This modules define the logic used to search for a doc in a given @@ -8,6 +10,8 @@ use crate::postings::compression::AlignedBuffer; #[cfg(target_arch = "x86_64")] mod sse2 { + use tantivy_fst::Ulen; + use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE}; use std::arch::x86_64::__m128i as DataType; use std::arch::x86_64::_mm_add_epi32 as op_add; @@ -53,7 +57,7 @@ mod sse2 { #[test] fn test_linear_search_sse2_128_u32() { - let mut block = [0u32; COMPRESSION_BLOCK_SIZE]; + let mut block = [0u32; COMPRESSION_BLOCK_SIZE as usize]; for el in 0u32..128u32 { block[el as usize] = el * 2 + 1 << 18; } @@ -72,7 +76,7 @@ fn linear_search(arr: &[u32], target: u32) -> usize { arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum() } -fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) { +fn exponential_search(arr: &[u32], target: u32) -> (usize,usize) { let end = arr.len(); let mut begin = 0; for &pivot in &[1, 3, 7, 15, 31, 63] { @@ -159,6 +163,8 @@ impl Default for BlockSearcher { #[cfg(test)] mod tests { + use tantivy_fst::Ulen; + use super::exponential_search; use super::linear_search; use 
super::BlockSearcher; @@ -193,7 +199,7 @@ mod tests { fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) { let cursor = search_in_block_trivial_but_slow(block, target); assert!(block.len() < COMPRESSION_BLOCK_SIZE); - let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE]; + let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE as usize]; output_buffer[..block.len()].copy_from_slice(block); assert_eq!( block_searcher.search_in_block(&AlignedBuffer(output_buffer), target), diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 6633d73611..7cbfe6edd4 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -11,7 +11,7 @@ use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; use crate::query::BM25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; -use tantivy_fst::FakeArr; +use tantivy_fst::{FakeArr, Ulen}; fn max_score>(mut it: I) -> Option { if let Some(first) = it.next() { @@ -31,7 +31,7 @@ fn max_score>(mut it: I) -> Option { #[derive(Clone)] pub struct BlockSegmentPostings { pub(crate) doc_decoder: BlockDecoder, - loaded_offset: usize, + loaded_offset: Ulen, freq_decoder: BlockDecoder, freq_reading_option: FreqReadingOption, block_max_score_cache: Option, @@ -50,13 +50,13 @@ fn decode_bitpacked_block( doc_num_bits: u8, tf_num_bits: u8, ) { - let num_bytes_docs = 128 * (doc_num_bits as usize) / 8; // 128 integers per bitpacker4x block. should be same as num_consumed_bytes returned by uncompress block + let num_bytes_docs = 128 * (doc_num_bits as Ulen) / 8; // 128 integers per bitpacker4x block. should be same as num_consumed_bytes returned by uncompress block let num_bytes_freqs = freq_decoder_opt.as_ref() - .map(|_| 128 * (tf_num_bits as usize) / 8) + .map(|_| 128 * (tf_num_bits as Ulen) / 8) .unwrap_or(0); let data = data.slice((0..num_bytes_docs + num_bytes_freqs).into()).to_vec(); let num_consumed_bytes = doc_decoder.uncompress_block_sorted(&data, doc_offset, doc_num_bits); - assert_eq!(num_bytes_docs, num_consumed_bytes); + assert_eq!(num_bytes_docs, num_consumed_bytes as Ulen); if let Some(freq_decoder) = freq_decoder_opt { freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits); } @@ -73,7 +73,7 @@ fn decode_vint_block( doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED); if let Some(freq_decoder) = freq_decoder_opt { freq_decoder.uncompress_vint_unsorted( - &data.slice((num_consumed_bytes..).into()), + &data.slice((num_consumed_bytes as Ulen..).into()), num_vint_docs, TERMINATED, ); @@ -87,7 +87,7 @@ fn split_into_skips_and_postings(doc_freq: u32, data: FileSlice) -> (Option(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { @@ -45,12 +45,12 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a #[inline(always)] pub fn uncompress_sorted(compressed_data: &dyn FakeArr, output: &mut [u32], offset: u32) -> usize { - let mut read_byte = 0; + let mut read_byte: usize = 0; let mut result = offset; for output_mut in output.iter_mut() { let mut shift = 0u32; loop { - let cur_byte = compressed_data.get_byte(read_byte); + let cur_byte = compressed_data.get_byte(read_byte as Ulen); read_byte += 1; result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { @@ -65,12 +65,12 @@ pub fn uncompress_sorted(compressed_data: &dyn FakeArr, output: &mut [u32], offs #[inline(always)] pub(crate) fn 
uncompress_unsorted(compressed_data: &dyn FakeArr, output_arr: &mut [u32]) -> usize { - let mut read_byte = 0; + let mut read_byte: usize = 0; for output_mut in output_arr.iter_mut() { let mut result = 0u32; let mut shift = 0u32; loop { - let cur_byte = compressed_data.get_byte(read_byte); + let cur_byte = compressed_data.get_byte(read_byte as Ulen); read_byte += 1; result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 1c79b8b1e2..000bc5dbca 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::HasLen; use crate::docset::DocSet; use crate::fastfield::DeleteBitSet; @@ -99,9 +101,9 @@ impl SegmentPostings { fieldnorms: Option<&[u32]>, ) -> SegmentPostings { use crate::directory::FileSlice; + use crate::fieldnorm::FieldNormReader; use crate::postings::serializer::PostingsSerializer; use crate::schema::IndexRecordOption; - use crate::fieldnorm::FieldNormReader; use crate::Score; let mut buffer: Vec = Vec::new(); let fieldnorm_reader = fieldnorms.map(FieldNormReader::for_test); @@ -165,7 +167,7 @@ impl DocSet for SegmentPostings { #[inline] fn advance(&mut self) -> DocId { debug_assert!(self.block_cursor.block_is_loaded()); - if self.cur == COMPRESSION_BLOCK_SIZE - 1 { + if self.cur == (COMPRESSION_BLOCK_SIZE - 1) { self.cur = 0; self.block_cursor.advance(); } else { @@ -215,8 +217,8 @@ impl DocSet for SegmentPostings { } impl HasLen for SegmentPostings { - fn len(&self) -> usize { - self.block_cursor.doc_freq() as usize + fn len(&self) -> Ulen { + self.block_cursor.doc_freq() as Ulen } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 9e8d9c39a2..ecd9f5b318 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::TermInfo; use crate::common::{BinarySerializable, VInt}; use crate::common::{CompositeWrite, CountingWriter}; @@ -263,16 +265,16 @@ impl<'a> FieldSerializer<'a> { } struct Block { - doc_ids: [DocId; COMPRESSION_BLOCK_SIZE], - term_freqs: [u32; COMPRESSION_BLOCK_SIZE], + doc_ids: [DocId; COMPRESSION_BLOCK_SIZE as usize], + term_freqs: [u32; COMPRESSION_BLOCK_SIZE as usize], len: usize, } impl Block { fn new() -> Self { Block { - doc_ids: [0u32; COMPRESSION_BLOCK_SIZE], - term_freqs: [0u32; COMPRESSION_BLOCK_SIZE], + doc_ids: [0u32; COMPRESSION_BLOCK_SIZE as usize], + term_freqs: [0u32; COMPRESSION_BLOCK_SIZE as usize], len: 0, } } diff --git a/src/postings/skip.rs b/src/postings/skip.rs index f2e9c8bab9..81d7540c73 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,6 +1,6 @@ use std::convert::TryInto; -use tantivy_fst::FakeArr; +use tantivy_fst::{FakeArr, Ulen}; use crate::directory::{FileSlice, OwnedBytes}; use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE}; @@ -75,7 +75,7 @@ pub(crate) struct SkipReader { pub(crate) last_doc_in_previous_block: DocId, owned_read: FileSlice, skip_info: IndexRecordOption, - byte_offset: usize, + byte_offset: Ulen, remaining_docs: u32, // number of docs remaining, including the // documents in the current block. 
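
// The VInt decoders above read 7 payload bits per byte and treat a *set* high
// bit as the terminator (note the stop-bit convention is inverted relative to
// LEB128). One decode step, as a standalone sketch:
fn decode_vint(data: &[u8]) -> (u32, usize) {
    // returns (value, bytes consumed)
    let mut value = 0u32;
    let mut shift = 0u32;
    for (i, &byte) in data.iter().enumerate() {
        value += u32::from(byte % 128) << shift;
        if byte & 128 != 0 {
            return (value, i + 1);
        }
        shift += 7;
    }
    (value, data.len())
}
// e.g. decode_vint(&[0x85]) == (5, 1)
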
block_info: BlockInfo, @@ -166,13 +166,13 @@ impl SkipReader { } #[inline(always)] - pub fn byte_offset(&self) -> usize { + pub fn byte_offset(&self) -> Ulen { self.byte_offset } fn read_block_info(&mut self) { let bytes = &self.owned_read.slice_to(std::cmp::min(12, self.owned_read.len())).to_vec(); - let advance_len: usize; + let advance_len: Ulen; self.last_doc_in_block = read_u32(bytes); let doc_num_bits = bytes[4]; match self.skip_info { @@ -246,13 +246,13 @@ impl SkipReader { .. } => { self.remaining_docs -= COMPRESSION_BLOCK_SIZE as u32; - self.byte_offset += compressed_block_size(doc_num_bits + tf_num_bits); + self.byte_offset += compressed_block_size(doc_num_bits + tf_num_bits) as Ulen; self.position_offset += tf_sum as u64; } BlockInfo::VInt { num_docs } => { debug_assert_eq!(num_docs, self.remaining_docs); self.remaining_docs = 0; - self.byte_offset = std::usize::MAX; + self.byte_offset = Ulen::MAX; } } self.last_doc_in_previous_block = self.last_doc_in_block; @@ -273,7 +273,7 @@ mod tests { use super::BlockInfo; use super::IndexRecordOption; use super::{SkipReader, SkipSerializer}; - use crate::directory::OwnedBytes; + use crate::directory::{FileSlice, OwnedBytes}; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; #[test] @@ -308,7 +308,7 @@ mod tests { }; let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; let mut skip_reader = - SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::WithFreqs); + SkipReader::new(OwnedBytes::new(buf).as_file_slice(), doc_freq, IndexRecordOption::WithFreqs); assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info, @@ -350,7 +350,7 @@ mod tests { }; let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; let mut skip_reader = - SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::Basic); + SkipReader::new(OwnedBytes::new(buf).as_file_slice(), doc_freq, IndexRecordOption::Basic); assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), @@ -391,7 +391,7 @@ mod tests { }; let doc_freq = COMPRESSION_BLOCK_SIZE as u32; let mut skip_reader = - SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::Basic); + SkipReader::new(OwnedBytes::new(buf).as_file_slice(), doc_freq, IndexRecordOption::Basic); assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 4e08f2e9f7..f9fa270270 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::{BinarySerializable, FixedSize}; use std::io; @@ -29,7 +31,7 @@ impl FixedSize for TermInfo { /// This is large, but in practise, `TermInfo` are encoded in blocks and /// only the first `TermInfo` of a block is serialized uncompressed. /// The subsequent `TermInfo` are delta encoded and bitpacked. 
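
// The TermInfo size constant below evaluates to 2*4 + 2*8 = 24 bytes per
// uncompressed TermInfo; as the doc comment above notes, only the first
// TermInfo of each block is stored at that full width. Standalone check:
const TERM_INFO_SIZE_IN_BYTES: u64 = 2 * 4 + 2 * 8; // two u32s + two u64s = 24
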
- const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES; + const SIZE_IN_BYTES: Ulen = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES; } impl BinarySerializable for TermInfo { diff --git a/src/query/vec_docset.rs b/src/query/vec_docset.rs index 89f32bd7f0..e766dddf9c 100644 --- a/src/query/vec_docset.rs +++ b/src/query/vec_docset.rs @@ -1,5 +1,7 @@ #![allow(dead_code)] +use tantivy_fst::Ulen; + use crate::common::HasLen; use crate::docset::{DocSet, TERMINATED}; use crate::DocId; @@ -43,8 +45,8 @@ impl DocSet for VecDocSet { } impl HasLen for VecDocSet { - fn len(&self) -> usize { - self.doc_ids.len() + fn len(&self) -> Ulen { + self.doc_ids.len() as Ulen } } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 679abc7c0a..7f8749163a 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1,5 +1,7 @@ mod pool; +use tantivy_fst::Ulen; + pub use self::pool::LeasedItem; use self::pool::Pool; use crate::core::Segment; diff --git a/src/reader/pool.rs b/src/reader/pool.rs index 7134e5adb1..d11a2b89ba 100644 --- a/src/reader/pool.rs +++ b/src/reader/pool.rs @@ -1,5 +1,6 @@ use crossbeam::channel::unbounded; use crossbeam::channel::{Receiver, RecvError, Sender}; +use tantivy_fst::Ulen; use std::ops::{Deref, DerefMut}; use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering; @@ -187,17 +188,18 @@ mod tests { use super::Pool; use super::Queue; use crossbeam::channel; + use tantivy_fst::Ulen; use std::{iter, mem}; #[test] fn test_pool() { - let items10: Vec = iter::repeat(10).take(10).collect(); + let items10: Vec = iter::repeat(10).take(10).collect(); let pool = Pool::new(); pool.publish_new_generation(items10); for _ in 0..20 { assert_eq!(*pool.acquire(), 10); } - let items11: Vec = iter::repeat(11).take(10).collect(); + let items11: Vec = iter::repeat(11).take(10).collect(); pool.publish_new_generation(items11); for _ in 0..20 { assert_eq!(*pool.acquire(), 11); diff --git a/src/schema/document.rs b/src/schema/document.rs index 1887821f2e..5c45d5c9d0 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::*; use crate::common::BinarySerializable; use crate::common::VInt; @@ -200,7 +202,7 @@ impl BinarySerializable for Document { } fn deserialize(reader: &mut R) -> io::Result { - let num_field_values = VInt::deserialize(reader)?.val() as usize; + let num_field_values = VInt::deserialize(reader)?.val() as Ulen; let field_values = (0..num_field_values) .map(|_| FieldValue::deserialize(reader)) .collect::>>()?; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 8bdb7c1405..0204e998ef 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -9,6 +9,7 @@ use serde::de::{SeqAccess, Visitor}; use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_json::{self, Map as JsonObject, Value as JsonValue}; +use tantivy_fst::Ulen; use std::fmt; /// Tantivy has a very strict schema. diff --git a/src/schema/term.rs b/src/schema/term.rs index 0662e5230b..772cb31cfc 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,5 +1,7 @@ use std::fmt; +use tantivy_fst::Ulen; + use super::Field; use crate::common; use crate::schema::Facet; diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 3bad8f8b05..72ba96fe43 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -12,10 +12,11 @@ under-count actual resultant space usage by up to 4095 bytes per file. 
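
// The 4095-byte caveat above reads like page-granularity rounding: a file's
// resident size rounds up to whole 4096-byte pages. Sketched standalone
// (the 4096 page size is an assumption of this sketch):
fn page_rounded(len: u64) -> u64 {
    const PAGE: u64 = 4096;
    (len + PAGE - 1) / PAGE * PAGE
}
// e.g. page_rounded(1) == 4096, i.e. up to 4095 bytes more than reported
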
use crate::schema::Field; use crate::SegmentComponent; use serde::{Deserialize, Serialize}; +use tantivy_fst::Ulen; use std::collections::HashMap; /// Indicates space usage in bytes -pub type ByteCount = usize; +pub type ByteCount = Ulen; /// Enum containing any of the possible space usage results for segment components. pub enum ComponentSpaceUsage { diff --git a/src/store/index/block.rs b/src/store/index/block.rs index 33785748c4..8c8d740a26 100644 --- a/src/store/index/block.rs +++ b/src/store/index/block.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::VInt; use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; use crate::DocId; @@ -81,7 +83,7 @@ impl CheckpointBlock { return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "")); } self.checkpoints.clear(); - let len = VInt::deserialize_u64(data)? as usize; + let len = VInt::deserialize_u64(data)? as Ulen; if len == 0 { return Ok(()); } diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index 4e93128763..eeab3da4f3 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -5,6 +5,8 @@ mod block; mod skip_index; mod skip_index_builder; +use tantivy_fst::Ulen; + use crate::DocId; pub use self::skip_index::SkipIndex; @@ -49,6 +51,7 @@ mod tests { use futures::executor::block_on; use proptest::strategy::{BoxedStrategy, Strategy}; + use tantivy_fst::Ulen; use crate::directory::OwnedBytes; use crate::indexer::NoMergePolicy; diff --git a/src/store/index/skip_index.rs b/src/store/index/skip_index.rs index f64dc5efd8..2eaf416edc 100644 --- a/src/store/index/skip_index.rs +++ b/src/store/index/skip_index.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::{BinarySerializable, VInt}; use crate::directory::OwnedBytes; use crate::store::index::block::CheckpointBlock; diff --git a/src/store/index/skip_index_builder.rs b/src/store/index/skip_index_builder.rs index 6d46dabed8..ea9d8e630d 100644 --- a/src/store/index/skip_index_builder.rs +++ b/src/store/index/skip_index_builder.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use crate::common::{BinarySerializable, VInt}; use crate::store::index::block::CheckpointBlock; use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; diff --git a/src/store/mod.rs b/src/store/mod.rs index 6eff6ddd70..b1e2dd8e6d 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -66,6 +66,8 @@ use self::compression_snap::{compress, decompress}; #[cfg(test)] pub mod tests { + use tantivy_fst::Ulen; + use super::*; use crate::directory::{Directory, RAMDirectory, WritePtr}; use crate::schema::Document; @@ -74,7 +76,7 @@ pub mod tests { use crate::schema::TextOptions; use std::path::Path; - pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema { + pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: Ulen) -> Schema { let mut schema_builder = Schema::builder(); let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored()); let field_title = diff --git a/src/store/reader.rs b/src/store/reader.rs index adcf114301..2fff046813 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -8,12 +8,13 @@ use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; use lru::LruCache; +use tantivy_fst::Ulen; use std::io; use std::mem::size_of; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; -const LRU_CACHE_CAPACITY: usize = 100; +const LRU_CACHE_CAPACITY: Ulen = 100; type Block = Arc>; @@ -38,7 +39,7 @@ impl StoreReader { let skip_index = SkipIndex::open(index_data); 
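
// The StoreReader being opened here keeps an LruCache of decompressed
// doc-store blocks keyed by block offset, so a read is a cache probe plus,
// on a miss, one checkpoint-bounded slice read. The probe-or-load shape,
// sketched with illustrative types (the real code uses lru::LruCache):
use std::collections::HashMap;

fn block_or_load<F>(cache: &mut HashMap<u64, Vec<u8>>, offset: u64, load: F) -> &Vec<u8>
where
    F: FnOnce() -> Vec<u8>,
{
    cache.entry(offset).or_insert_with(load)
}
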
Ok(StoreReader { data: data_file, - cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))), + cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY as usize))), cache_hits: Default::default(), cache_misses: Default::default(), skip_index: Arc::new(skip_index), @@ -61,8 +62,8 @@ impl StoreReader { fn compressed_block(&self, checkpoint: &Checkpoint) -> io::Result { self.data .slice( - checkpoint.start_offset as usize, - checkpoint.end_offset as usize, + checkpoint.start_offset as Ulen, + checkpoint.end_offset as Ulen, ) .read_bytes() } @@ -117,10 +118,10 @@ impl StoreReader { } fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> { - let (data, footer_len_bytes) = data.split_from_end(size_of::()); + let (data, footer_len_bytes) = data.split_from_end(size_of::() as Ulen); let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?; let mut serialized_offset_buf = serialized_offset.as_slice(); - let offset = u64::deserialize(&mut serialized_offset_buf)? as usize; + let offset = u64::deserialize(&mut serialized_offset_buf)? as Ulen; Ok(data.split(offset)) } @@ -162,7 +163,7 @@ mod tests { .lock() .unwrap() .peek_lru() - .map(|(&k, _)| k as usize), + .map(|(&k, _)| k as Ulen), Some(0) ); @@ -179,7 +180,7 @@ mod tests { .lock() .unwrap() .peek_lru() - .map(|(&k, _)| k as usize), + .map(|(&k, _)| k as Ulen), Some(0) ); @@ -195,7 +196,7 @@ mod tests { .lock() .unwrap() .peek_lru() - .map(|(&k, _)| k as usize), + .map(|(&k, _)| k as Ulen), Some(18806) ); diff --git a/src/store/writer.rs b/src/store/writer.rs index 3309f1a648..2f1f7ab6f9 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,3 +1,5 @@ +use tantivy_fst::Ulen; + use super::compress; use super::index::SkipIndexBuilder; use super::StoreReader; @@ -10,7 +12,7 @@ use crate::store::index::Checkpoint; use crate::DocId; use std::io::{self, Write}; -const BLOCK_SIZE: usize = 16_384; +const BLOCK_SIZE: Ulen = 16_384; /// Write tantivy's [`Store`](./index.html) /// @@ -58,7 +60,7 @@ impl StoreWriter { self.current_block .write_all(&self.intermediary_buffer[..])?; self.doc += 1; - if self.current_block.len() > BLOCK_SIZE { + if self.current_block.len() > BLOCK_SIZE as usize { self.write_and_compress_block()?; } Ok(()) diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index 93a5e91086..f7dea1ecf4 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -5,7 +5,7 @@ use crate::postings::TermInfo; use crate::termdict::TermOrdinal; use byteorder::{ByteOrder, LittleEndian}; use tantivy_fst::Ulen; -use std::cmp; +use std::{cmp, convert::TryInto}; use std::io::{self, Read, Write}; const BLOCK_LEN: usize = 256; @@ -47,7 +47,7 @@ impl BinarySerializable for TermInfoBlockMeta { } impl FixedSize for TermInfoBlockMeta { - const SIZE_IN_BYTES: usize = + const SIZE_IN_BYTES: Ulen = u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES; } @@ -61,13 +61,13 @@ impl TermInfoBlockMeta { // is encoded without bitpacking. fn deserialize_term_info(&self, data: &dyn FakeArr, inner_offset: usize) -> TermInfo { assert!(inner_offset < BLOCK_LEN - 1); - let num_bits = self.num_bits() as usize; + let num_bits = self.num_bits() as Ulen; - let posting_start_addr = num_bits * inner_offset; + let posting_start_addr = num_bits * inner_offset as Ulen; // the stop offset is the start offset of the next term info. 
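
// extract_bits(), called above, pulls an arbitrary <=56-bit field out of a
// bitpacked stream by reading the 8 bytes at addr_bits/8 and shifting by
// addr_bits%8. An equivalent over a plain byte slice, as a sketch (assumes
// little-endian packing and an in-bounds addr_bits):
fn extract_bits_slice(data: &[u8], addr_bits: u64, num_bits: u8) -> u64 {
    assert!(num_bits <= 56); // shift (<=7) + num_bits must fit in the u64 window
    let addr_byte = (addr_bits / 8) as usize;
    let end = (addr_byte + 8).min(data.len());
    let mut bytes = [0u8; 8];
    bytes[..end - addr_byte].copy_from_slice(&data[addr_byte..end]);
    let word = u64::from_le_bytes(bytes);
    let mask = (1u64 << num_bits) - 1;
    (word >> (addr_bits % 8)) & mask
}
// e.g. with data = [65, 157, 1, 0, 0, 0, 0, 0, 0, 0]:
// (0, 9) -> 321, (9, 2) -> 2, (11, 6) -> 51, matching the test above
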
diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs
index 93a5e91086..f7dea1ecf4 100644
--- a/src/termdict/fst_termdict/term_info_store.rs
+++ b/src/termdict/fst_termdict/term_info_store.rs
@@ -5,7 +5,7 @@ use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
 use byteorder::{ByteOrder, LittleEndian};
 use tantivy_fst::Ulen;
-use std::cmp;
+use std::{cmp, convert::TryInto};
 use std::io::{self, Read, Write};
 
 const BLOCK_LEN: usize = 256;
@@ -47,7 +47,7 @@ impl BinarySerializable for TermInfoBlockMeta {
 }
 
 impl FixedSize for TermInfoBlockMeta {
-    const SIZE_IN_BYTES: usize =
+    const SIZE_IN_BYTES: Ulen =
         u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES;
 }
 
@@ -61,13 +61,13 @@ impl TermInfoBlockMeta {
     // is encoded without bitpacking.
     fn deserialize_term_info(&self, data: &dyn FakeArr, inner_offset: usize) -> TermInfo {
         assert!(inner_offset < BLOCK_LEN - 1);
-        let num_bits = self.num_bits() as usize;
+        let num_bits = self.num_bits() as Ulen;
 
-        let posting_start_addr = num_bits * inner_offset;
+        let posting_start_addr = num_bits * inner_offset as Ulen;
         // the stop offset is the start offset of the next term info.
         let posting_stop_addr = posting_start_addr + num_bits;
-        let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
-        let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;
+        let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as Ulen;
+        let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as Ulen;
 
         let postings_start_offset = self.ref_term_info.postings_start_offset
             + extract_bits(data, posting_start_addr, self.postings_offset_nbits);
@@ -88,7 +88,7 @@ impl TermInfoBlockMeta {
 
 #[derive(Debug)]
 pub struct TermInfoStore {
-    num_terms: usize,
+    num_terms: Ulen,
     block_meta_bytes: FileSlice,
     term_info_bytes: FileSlice,
 }
@@ -118,8 +118,8 @@ impl TermInfoStore {
     pub fn open(term_info_store_file: FileSlice) -> crate::Result<TermInfoStore> {
         let (len_slice, main_slice) = term_info_store_file.split(16);
         let mut bytes = len_slice.read_bytes()?;
-        let len = u64::deserialize(&mut bytes)? as usize;
-        let num_terms = u64::deserialize(&mut bytes)? as usize;
+        let len = u64::deserialize(&mut bytes)? as Ulen;
+        let num_terms = u64::deserialize(&mut bytes)? as Ulen;
         let (block_meta_file, term_info_file) = main_slice.split(len);
         Ok(TermInfoStore {
             num_terms,
@@ -129,23 +129,23 @@
     }
 
     pub fn get(&self, term_ord: TermOrdinal) -> TermInfo {
-        let block_id = (term_ord as usize) / BLOCK_LEN;
+        let block_id = (term_ord) / (BLOCK_LEN as Ulen);
         let block_data = self.block_meta_bytes.slice(block_id * TermInfoBlockMeta::SIZE_IN_BYTES, HasLen::len(&self.block_meta_bytes));
         let mut block_data = block_data.full_slice();
         let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data)
             .expect("Failed to deserialize terminfoblockmeta");
-        let inner_offset = (term_ord as usize) % BLOCK_LEN;
+        let inner_offset = (term_ord as Ulen) % (BLOCK_LEN as Ulen);
         if inner_offset == 0 {
             return term_info_block_data.ref_term_info;
         }
-        let term_info_data = self.term_info_bytes.slice(term_info_block_data.offset as usize, HasLen::len(&self.term_info_bytes));
+        let term_info_data = self.term_info_bytes.slice(term_info_block_data.offset, HasLen::len(&self.term_info_bytes));
         term_info_block_data.deserialize_term_info(
             &term_info_data,
-            inner_offset - 1,
+            (inner_offset - 1).try_into().unwrap(),
         )
     }
 
-    pub fn num_terms(&self) -> usize {
+    pub fn num_terms(&self) -> Ulen {
         self.num_terms
     }
 }
@@ -304,9 +304,9 @@ mod tests {
         assert_eq!(compute_num_bits(51), 6);
         bitpack.close(&mut buffer).unwrap();
         assert_eq!(buffer.len(), 3 + 7);
-        assert_eq!(extract_bits(buffer, 0, 9), 321u64);
-        assert_eq!(extract_bits(buffer, 9, 2), 2u64);
-        assert_eq!(extract_bits(buffer, 11, 6), 51u64);
+        assert_eq!(extract_bits(&buffer, 0, 9), 321u64);
+        assert_eq!(extract_bits(&buffer, 9, 2), 2u64);
+        assert_eq!(extract_bits(&buffer, 11, 6), 51u64);
     }
 
     #[test]
diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs
index 50a4f2dd5c..fbca4b0e03 100644
--- a/src/termdict/fst_termdict/termdict.rs
+++ b/src/termdict/fst_termdict/termdict.rs
@@ -7,7 +7,7 @@ use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
 use once_cell::sync::Lazy;
 use std::io::{self, Write};
-use tantivy_fst::{raw::Fst};
+use tantivy_fst::{Ulen, raw::Fst};
 use tantivy_fst::Automaton;
 
 fn convert_fst_error(e: tantivy_fst::Error) -> io::Error {
@@ -122,7 +122,7 @@ impl TermDictionary {
         let (main_slice, footer_len_slice) = file.split_from_end(8);
         let mut footer_len_bytes = footer_len_slice.read_bytes()?;
         let footer_size = u64::deserialize(&mut footer_len_bytes)?;
-        let (fst_file_slice, values_file_slice) = main_slice.split_from_end(footer_size as usize);
+        let (fst_file_slice, values_file_slice) = main_slice.split_from_end(footer_size as Ulen);
         let fst_index = open_fst_index(fst_file_slice)?;
         let term_info_store = TermInfoStore::open(values_file_slice)?;
         Ok(TermDictionary {
@@ -138,7 +138,7 @@
 
     /// Returns the number of terms in the dictionary.
     /// Term ordinals range from 0 to `num_terms() - 1`.
-    pub fn num_terms(&self) -> usize {
+    pub fn num_terms(&self) -> Ulen {
         self.term_info_store.num_terms()
     }
 
diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs
index 924f31c8ba..81be689377 100644
--- a/src/termdict/merger.rs
+++ b/src/termdict/merger.rs
@@ -1,3 +1,5 @@
+use tantivy_fst::Ulen;
+
 use crate::schema::Term;
 use crate::termdict::TermOrdinal;
 use crate::termdict::TermStreamer;
diff --git a/src/termdict/tests.rs b/src/termdict/tests.rs
index 9e0bde752a..b37cf34ae3 100644
--- a/src/termdict/tests.rs
+++ b/src/termdict/tests.rs
@@ -1,3 +1,5 @@
+use tantivy_fst::Ulen;
+
 use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
 use crate::directory::{Directory, FileSlice, RAMDirectory, TerminatingWrite};

From cecf63544e158fc723a9b009520e9d0165f805b1 Mon Sep 17 00:00:00 2001
From: phiresky
Date: Mon, 24 May 2021 17:32:22 +0200
Subject: [PATCH 7/9] some info logs

---
 src/collector/top_score_collector.rs  |  1 +
 src/core/inverted_index_reader.rs     | 15 +++++++++++----
 src/core/searcher.rs                  |  4 +++-
 src/lib.rs                            | 17 ++++++++++++++---
 src/schema/term.rs                    | 16 ++++++++++------
 src/termdict/fst_termdict/termdict.rs |  6 ++----
 6 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs
index f58095b212..a0841da6dd 100644
--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -649,6 +649,7 @@ impl Collector for TopDocs {
                 threshold
             })?;
         } else {
+            crate::info_log(format!("Scoring results and collecting TOP {}", self.0.limit));
             weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
                 let heap_item = ComparableDoc {
                     feature: score,
diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs
index 23349bec39..0189f5f422 100644
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -1,6 +1,6 @@
 use std::io;
 
-use tantivy_fst::Ulen;
+use tantivy_fst::{FakeArr, Ulen};
 
 use crate::common::BinarySerializable;
 use crate::directory::FileSlice;
@@ -108,8 +108,10 @@
         term: &Term,
         option: IndexRecordOption,
     ) -> io::Result<Option<BlockSegmentPostings>> {
-        self.get_term_info(term)?
-            .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
+        crate::info_log(format!("reading term info for term {:?}", term));
+
+        let info = self.get_term_info(term)?;
+        info.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
             .transpose()
     }
 
@@ -126,6 +128,8 @@
             term_info.postings_start_offset as Ulen,
             term_info.postings_stop_offset as Ulen,
         );
+
+        postings_data.to_vec(); // force-load the whole postings range in a single read
         BlockSegmentPostings::open(
             term_info.doc_freq,
             postings_data,
@@ -183,7 +187,10 @@
         option: IndexRecordOption,
     ) -> io::Result<Option<SegmentPostings>> {
         self.get_term_info(term)?
-            .map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
+            .map(move |term_info| {
+                crate::info_log(format!("Fetching document ids and frequencies matching term {:?}", term));
+                self.read_postings_from_terminfo(&term_info, option)
+            })
             .transpose()
     }
 
diff --git a/src/core/searcher.rs b/src/core/searcher.rs
index b181762e61..9d5773943e 100644
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -58,7 +58,9 @@ impl Searcher {
     pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
         let DocAddress(segment_local_id, doc_id) = doc_address;
         let store_reader = &self.store_readers[segment_local_id as usize];
-        store_reader.get(doc_id)
+        let doc = store_reader.get(doc_id)?;
+        crate::info_log(format!("read content of doc {:?}", doc.field_values()));
+        Ok(doc)
     }
 
     /// Access the schema associated to the index of this searcher.
diff --git a/src/lib.rs b/src/lib.rs
index 6fd6038a23..0d255167aa 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -170,7 +170,7 @@ pub use crate::reader::LeasedItem;
 pub use crate::schema::{Document, Term};
 
 use std::fmt;
-use once_cell::sync::Lazy;
+use once_cell::sync::{Lazy, OnceCell};
 use serde::{Deserialize, Serialize};
 
 /// Index format version.
@@ -280,9 +280,19 @@ impl DocAddress {
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct DocAddress(pub SegmentLocalId, pub DocId);
 
+static INFO_LOG_HOOK: OnceCell<Box<dyn Fn(&str) + Send + Sync>> = OnceCell::new();
+
+pub fn info_log(message: impl AsRef<str>) {
+    if let Some(log) = INFO_LOG_HOOK.get() {
+        log(message.as_ref());
+    }
+}
+pub fn set_info_log_hook(f: impl Fn(&str) + Send + Sync + 'static) {
+    INFO_LOG_HOOK.set(Box::new(f)).ok();
+}
+
 #[cfg(test)]
 mod tests {
-    use crate::{Directory, collector::tests::TEST_COLLECTOR_WITH_SCORE};
     use crate::core::SegmentReader;
     use crate::docset::{DocSet, TERMINATED};
     use crate::query::BooleanQuery;
@@ -291,6 +301,7 @@
     use crate::Index;
     use crate::Postings;
     use crate::ReloadPolicy;
+    use crate::{collector::tests::TEST_COLLECTOR_WITH_SCORE, Directory};
     use rand::distributions::Bernoulli;
     use rand::distributions::Uniform;
     use rand::rngs::StdRng;
@@ -639,7 +650,7 @@
         index_writer.commit()?;
         println!("dir: {:#?}", index.directory());
         let reader = index.reader()?;
-        
+
         let searcher = reader.searcher();
         let term = Term::from_field_i64(value_field, negative_val);
         let mut postings = searcher
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 772cb31cfc..8c0c0432c7 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -233,12 +233,16 @@ where
 
 impl fmt::Debug for Term {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "Term(field={},bytes={:?})",
-            self.field().field_id(),
-            self.value_bytes()
-        )
+        if let Ok(s) = std::str::from_utf8(&self.0) {
+            write!(f, "Term(field={},text={:?})", self.field().field_id(), s)
+        } else {
+            write!(
+                f,
+                "Term(field={},bytes={:?})",
+                self.field().field_id(),
+                self.value_bytes()
+            )
+        }
     }
 }
 
diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs
index fbca4b0e03..409f4842ef 100644
--- a/src/termdict/fst_termdict/termdict.rs
+++ b/src/termdict/fst_termdict/termdict.rs
@@ -86,11 +86,9 @@ where
 }
 
 fn open_fst_index(fst_file: FileSlice) -> crate::Result<tantivy_fst::Map<FileSlice>> {
-    println!("open_fst_index()");
     let fst = Fst::new(fst_file)
         .map_err(|err| DataCorruption::comment_only(format!("Fst data is corrupted: {:?}", err)))?;
     let ret = Ok(tantivy_fst::Map::from(fst));
-    println!("open_fst_index RET");
     return ret;
 }
 
@@ -144,9 +142,9 @@
     /// Returns the ordinal associated to a given term.
     pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
-        println!("termdict.term_ord({:?})", String::from_utf8_lossy(key.as_ref()));
+        crate::info_log(format!("Getting info for term {:?}", String::from_utf8_lossy(key.as_ref())));
         let ret = Ok(self.fst_index.get(key));
-        println!("termdict.term_ord RET");
+        //crate::info_log(format!("termdict.term_ord RET {:?}", ret));
         return ret;
     }
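Taken together, patch 7 gives the embedding application a running commentary on what the engine is doing. A minimal sketch of wiring the hook (the surrounding application code is hypothetical; note that set_info_log_hook as introduced here returns (), and only becomes a Result in patch 9):

    fn install_logging() {
        // Forward tantivy's info messages, e.g. to the browser console in a wasm build.
        tantivy::set_info_log_hook(|msg| eprintln!("[tantivy] {}", msg));
    }

Once installed, a search emits lines such as `Getting info for term "abcd"` and `Fetching document ids and frequencies matching term Term(field=1,text="abcd")` from the call sites added above.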
From 57b9c59f4f33dcfe64de09cbd746c393aff89f54 Mon Sep 17 00:00:00 2001
From: phiresky
Date: Sat, 29 May 2021 14:10:05 +0200
Subject: [PATCH 8/9] comment

---
 src/directory/fs_directory.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/directory/fs_directory.rs b/src/directory/fs_directory.rs
index 5a29d194cd..e55ca6e489 100644
--- a/src/directory/fs_directory.rs
+++ b/src/directory/fs_directory.rs
@@ -12,6 +12,9 @@ use super::{
     AntiCallToken, WatchCallback, WritePtr,
 };
 
+// For demonstration purposes only: a directory that reads from the filesystem on demand, without memory mapping, with an integrated cache.
+// This is *not used* in my wasm demo, which uses its own caching and hooks into the Web APIs.
+
 #[derive(Debug, Clone)]
 pub struct FsDirectory {
     root: PathBuf,

From 6bd8a8d9ef702bda9b76119c2732542a8aa3e04e Mon Sep 17 00:00:00 2001
From: phiresky
Date: Sun, 30 May 2021 18:30:19 +0200
Subject: [PATCH 9/9] api to get multiple chunks at the same time

---
 src/core/searcher.rs        |  9 +++++++++
 src/directory/file_slice.rs | 12 ++++++++++++
 src/lib.rs                  |  6 +++---
 src/store/reader.rs         | 18 ++++++++++++++++++
 4 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/src/core/searcher.rs b/src/core/searcher.rs
index 9d5773943e..564944e32b 100644
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -63,6 +63,15 @@ impl Searcher {
         Ok(doc)
     }
 
+    pub fn doc_multiple(&self, doc_addresses: Vec<DocAddress>) -> crate::Result<Vec<Document>> {
+        if doc_addresses.is_empty() {
+            return Ok(vec![]);
+        }
+        assert!(doc_addresses.windows(2).all(|s| s[0].0 == s[1].0), "only supported within a single segment for now");
+        let store_reader = &self.store_readers[doc_addresses[0].0 as usize];
+        store_reader.get_multiple(&doc_addresses.into_iter().map(|d| d.1).collect::<Vec<_>>())
+    }
+
     /// Access the schema associated to the index of this searcher.
     pub fn schema(&self) -> &Schema {
         &self.schema
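For context, a caller that batches stored-document retrieval might look like the following sketch; `searcher`, `schema`, and `top_docs` (the output of a TopDocs collector run) are assumed, and, as the assert above states, all addresses currently have to come from the same segment:

    use tantivy::DocAddress;

    let addresses: Vec<DocAddress> = top_docs.iter().map(|(_score, addr)| *addr).collect();
    // One batched round-trip to the storage backend instead of one per document.
    let docs = searcher.doc_multiple(addresses)?;
    for doc in docs {
        println!("{}", schema.to_json(&doc));
    }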
diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs
index a676818b52..2aac1addf2 100644
--- a/src/directory/file_slice.rs
+++ b/src/directory/file_slice.rs
@@ -5,6 +5,7 @@ use tantivy_fst::Ulen;
 use crate::common::HasLen;
 use crate::directory::OwnedBytes;
 use std::fmt::Debug;
+use std::ops::Range;
 use std::sync::{Arc, Weak};
 use std::{io, ops::Deref};
 
@@ -24,6 +25,12 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + Debug {
     ///
     /// This method may panic if the range requested is invalid.
     fn read_bytes(&self, from: Ulen, to: Ulen) -> io::Result<OwnedBytes>;
+
+    /// Optimization: read multiple ranges at once if the backend supports it.
+    fn read_bytes_multiple(&self, ranges: &[Range<Ulen>]) -> io::Result<Vec<OwnedBytes>> {
+        crate::info_log("warn: unoptimized read of multiple ranges");
+        ranges.iter().map(|r| self.read_bytes(r.start, r.end)).collect()
+    }
 }
 
 impl FakeArr for FileSlice {
@@ -134,6 +141,11 @@ impl FileSlice {
         self.data.read_bytes(self.start + from, self.start + to)
     }
 
+    pub fn read_bytes_slice_multiple(&self, ranges: &[Range<Ulen>]) -> io::Result<Vec<OwnedBytes>> {
+        let real_ranges: Vec<Range<Ulen>> = ranges.iter().map(|r| (r.start + self.start)..(r.end + self.start)).collect();
+        self.data.read_bytes_multiple(&real_ranges)
+    }
+
     /// Splits the FileSlice at the given offset and return two file slices.
     /// `file_slice[..split_offset]` and `file_slice[split_offset..]`.
     ///
diff --git a/src/lib.rs b/src/lib.rs
index 0d255167aa..8f0e634e45 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -280,15 +280,15 @@ impl DocAddress {
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct DocAddress(pub SegmentLocalId, pub DocId);
 
-static INFO_LOG_HOOK: OnceCell> = OnceCell::new();
+static INFO_LOG_HOOK: OnceCell> = OnceCell::new();
 
 pub fn info_log(message: impl AsRef<str>) {
     if let Some(log) = INFO_LOG_HOOK.get() {
         log(message.as_ref());
     }
 }
-pub fn set_info_log_hook(f: impl Fn(&str) + Send + Sync + 'static) {
-    INFO_LOG_HOOK.set(Box::new(f)).ok();
+pub fn set_info_log_hook(f: impl Fn(&str) + Send + Sync + 'static) -> std::result::Result<(), ()> {
+    INFO_LOG_HOOK.set(Box::new(f)).map_err(|_| ())
 }
 
 #[cfg(test)]
diff --git a/src/store/reader.rs b/src/store/reader.rs
index 2fff046813..52505a8629 100644
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -89,6 +89,15 @@ impl StoreReader {
         Ok(block)
     }
 
+    fn cache_blocks_multiple(&self, checkpoints: &[Checkpoint]) -> io::Result<()> {
+        // just to cache them so the next read is instant; TODO: don't rely on caching within FileSlice, use self.cache instead?
+        // crate::info_log("caching multiple");
+        let ranges = checkpoints.iter().map(|c| (c.start_offset as Ulen)..(c.end_offset as Ulen)).collect::<Vec<_>>();
+        self.data.read_bytes_slice_multiple(&ranges)?;
+        // crate::info_log("caching multiple done");
+        Ok(())
+    }
+
     /// Reads a given document.
     ///
     /// Calling `.get(doc)` is relatively costly as it requires
@@ -100,6 +109,7 @@
         let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
             crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
         })?;
+        crate::info_log(format!("decompressing block for doc {}", doc_id));
         let mut cursor = &self.read_block(&checkpoint)?[..];
         for _ in checkpoint.start_doc..doc_id {
             let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
@@ -111,6 +121,14 @@
         Ok(Document::deserialize(&mut cursor)?)
     }
 
+    /// Reads the given document ids.
+    /// May be faster than getting them separately if the storage backend supports it.
+    pub fn get_multiple(&self, doc_ids: &[DocId]) -> crate::Result<Vec<Document>> {
+        let checkpoints: Vec<Checkpoint> = doc_ids.iter().flat_map(|doc_id| self.block_checkpoint(*doc_id)).collect();
+        self.cache_blocks_multiple(&checkpoints)?;
+        doc_ids.iter().map(|d| self.get(*d)).collect()
+    }
+
     /// Summarize total space usage of this store reader.
     pub fn space_usage(&self) -> StoreSpaceUsage {
         self.space_usage.clone()
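The default read_bytes_multiple above just loops over read_bytes; the point of the API is that a backend with expensive round-trips can override it. A sketch of such an override for a hypothetical remote file (RemoteFile and its fetch_ranges method, which retrieves several byte ranges in one request, are assumptions; the HasLen and Debug impls required by FileHandle are elided, and OwnedBytes::new is assumed to accept a Vec<u8>, as it does for other stable-deref buffers in this codebase):

    use std::io;
    use std::ops::Range;
    use tantivy::directory::{FileHandle, OwnedBytes};
    use tantivy_fst::Ulen;

    impl FileHandle for RemoteFile {
        fn read_bytes(&self, from: Ulen, to: Ulen) -> io::Result<OwnedBytes> {
            let body = self.fetch_ranges(&[from..to])?.remove(0);
            Ok(OwnedBytes::new(body))
        }

        fn read_bytes_multiple(&self, ranges: &[Range<Ulen>]) -> io::Result<Vec<OwnedBytes>> {
            // One request for all ranges instead of ranges.len() requests.
            let bodies = self.fetch_ranges(ranges)?;
            Ok(bodies.into_iter().map(OwnedBytes::new).collect())
        }
    }

With such a backend in place, StoreReader::get_multiple issues a single batched fetch via cache_blocks_multiple and then decompresses each block from the now-warm cache.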