From b5a3a946f4a3f10b776dcfa7cbc8f7517260c5f3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 25 Apr 2023 09:00:49 +0800 Subject: [PATCH] refactor vint - improve performance of vint vint serialization shows up in performance profiles during indexing. It would also make sense to limit the value space to u29 and operate on 4 bytes only. - remove unused code - add missing inlines - fix regex test --- common/benches/bench.rs | 39 +++++++++++++++++ common/src/lib.rs | 3 +- common/src/vint.rs | 75 +++++++------------------------- src/schema/term.rs | 1 + src/tokenizer/regex_tokenizer.rs | 4 +- stacker/src/expull.rs | 2 + 6 files changed, 61 insertions(+), 63 deletions(-) create mode 100644 common/benches/bench.rs diff --git a/common/benches/bench.rs b/common/benches/bench.rs new file mode 100644 index 0000000000..74ccb555ee --- /dev/null +++ b/common/benches/bench.rs @@ -0,0 +1,39 @@ +#![feature(test)] + +extern crate test; + +#[cfg(test)] +mod tests { + use rand::seq::IteratorRandom; + use rand::thread_rng; + use tantivy_common::serialize_vint_u32; + use test::Bencher; + + #[bench] + fn bench_vint(b: &mut Bencher) { + let vals: Vec = (0..20_000).collect(); + b.iter(|| { + let mut out = 0u64; + for val in vals.iter().cloned() { + let mut buf = [0u8; 8]; + serialize_vint_u32(val, &mut buf); + out += u64::from(buf[0]); + } + out + }); + } + + #[bench] + fn bench_vint_rand(b: &mut Bencher) { + let vals: Vec = (0..20_000).choose_multiple(&mut thread_rng(), 100_000); + b.iter(|| { + let mut out = 0u64; + for val in vals.iter().cloned() { + let mut buf = [0u8; 8]; + serialize_vint_u32(val, &mut buf); + out += u64::from(buf[0]); + } + out + }); + } +} diff --git a/common/src/lib.rs b/common/src/lib.rs index 68a29ed4d7..d2a935c51d 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -19,8 +19,7 @@ pub use group_by::GroupByIteratorExtended; pub use ownedbytes::{OwnedBytes, StableDeref}; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use vint::{ - deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128, - serialize_vint_u32, write_u32_vint, VInt, VIntU128, + read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128, }; pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite}; diff --git a/common/src/vint.rs b/common/src/vint.rs index 204f0a25a0..ee8e0aa6f7 100644 --- a/common/src/vint.rs +++ b/common/src/vint.rs @@ -1,8 +1,6 @@ use std::io; use std::io::{Read, Write}; -use byteorder::{ByteOrder, LittleEndian}; - use super::BinarySerializable; /// Variable int serializes a u128 number @@ -19,26 +17,6 @@ pub fn serialize_vint_u128(mut val: u128, output: &mut Vec) { } } -/// Deserializes a u128 number -/// -/// Returns the number and the slice after the vint -pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> { - let mut result = 0u128; - let mut shift = 0u64; - for i in 0..19 { - let b = data[i]; - result |= u128::from(b % 128u8) << shift; - if b >= STOP_BIT { - return Ok((result, &data[i + 1..])); - } - shift += 7; - } - Err(io::Error::new( - io::ErrorKind::InvalidData, - "Failed to deserialize u128 vint", - )) -} - /// Wrapper over a `u128` that serializes as a variable int. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct VIntU128(pub u128); @@ -80,17 +58,13 @@ pub struct VInt(pub u64); const STOP_BIT: u8 = 128; +#[inline] pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] { const START_2: u64 = 1 << 7; const START_3: u64 = 1 << 14; const START_4: u64 = 1 << 21; const START_5: u64 = 1 << 28; - const STOP_1: u64 = START_2 - 1; - const STOP_2: u64 = START_3 - 1; - const STOP_3: u64 = START_4 - 1; - const STOP_4: u64 = START_5 - 1; - const MASK_1: u64 = 127; const MASK_2: u64 = MASK_1 << 7; const MASK_3: u64 = MASK_2 << 7; @@ -99,25 +73,29 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] { let val = u64::from(val); const STOP_BIT: u64 = 128u64; - let (res, num_bytes) = match val { - 0..=STOP_1 => (val | STOP_BIT, 1), - START_2..=STOP_2 => ( + let (res, num_bytes) = if val < START_2 { + (val | STOP_BIT, 1) + } else if val < START_3 { + ( (val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)), 2, - ), - START_3..=STOP_3 => ( + ) + } else if val < START_4 { + ( (val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)), 3, - ), - START_4..=STOP_4 => ( + ) + } else if val < START_5 { + ( (val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | ((val & MASK_4) << 3) | (STOP_BIT << (8 * 3)), 4, - ), - _ => ( + ) + } else { + ( (val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) @@ -125,9 +103,9 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] { | ((val & MASK_5) << 4) | (STOP_BIT << (8 * 4)), 5, - ), + ) }; - LittleEndian::write_u64(&mut buf[..], res); + *buf = res.to_le_bytes(); &buf[0..num_bytes] } @@ -245,7 +223,6 @@ impl BinarySerializable for VInt { mod tests { use super::{serialize_vint_u32, BinarySerializable, VInt}; - use crate::vint::{deserialize_vint_u128, serialize_vint_u128, VIntU128}; fn aux_test_vint(val: u64) { let mut v = [14u8; 10]; @@ -287,26 +264,6 @@ mod tests { assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val); } - fn aux_test_vint_u128(val: u128) { - let mut data = vec![]; - serialize_vint_u128(val, &mut data); - let (deser_val, _data) = deserialize_vint_u128(&data).unwrap(); - assert_eq!(val, deser_val); - - let mut out = vec![]; - VIntU128(val).serialize(&mut out).unwrap(); - let deser_val = VIntU128::deserialize(&mut &out[..]).unwrap(); - assert_eq!(val, deser_val.0); - } - - #[test] - fn test_vint_u128() { - aux_test_vint_u128(0); - aux_test_vint_u128(1); - aux_test_vint_u128(u128::MAX / 3); - aux_test_vint_u128(u128::MAX); - } - #[test] fn test_vint_u32() { aux_test_serialize_vint_u32(0); diff --git a/src/schema/term.rs b/src/schema/term.rs index e1411e7c00..2722a4bf66 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -268,6 +268,7 @@ where B: AsRef<[u8]> /// /// Do NOT rely on this byte representation in the index. /// This value is likely to change in the future. + #[inline] pub fn serialized_term(&self) -> &[u8] { self.0.as_ref() } diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index 9030fb04ed..f65a5cece1 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -137,11 +137,11 @@ mod tests { #[test] fn test_regexp_tokenizer_error_on_invalid_regex() { - let tokenizer = RegexTokenizer::new(r"\@"); + let tokenizer = RegexTokenizer::new(r"\@("); assert_eq!(tokenizer.is_err(), true); assert_eq!( tokenizer.err().unwrap().to_string(), - "An invalid argument was passed: '\\@'" + "An invalid argument was passed: '\\@('" ); } diff --git a/stacker/src/expull.rs b/stacker/src/expull.rs index e9d34501eb..2fc565fbcb 100644 --- a/stacker/src/expull.rs +++ b/stacker/src/expull.rs @@ -99,12 +99,14 @@ fn ensure_capacity<'a>( } impl<'a> ExpUnrolledLinkedListWriter<'a> { + #[inline] pub fn write_u32_vint(&mut self, val: u32) { let mut buf = [0u8; 8]; let data = serialize_vint_u32(val, &mut buf); self.extend_from_slice(data); } + #[inline] pub fn extend_from_slice(&mut self, mut buf: &[u8]) { while !buf.is_empty() { let add_len: usize;