Skip to content

Commit

Permalink
refactor vint
Browse files Browse the repository at this point in the history
- improve performance of vint
vint serialization shows up in performance profiles during indexing.
It would also make sense to limit the value space to u29 and operate on 4 bytes only.
- remove unused code
- add missing inlines
- fix regex test
  • Loading branch information
PSeitz committed Apr 25, 2023
1 parent 9c93bfe commit b5a3a94
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 63 deletions.
39 changes: 39 additions & 0 deletions common/benches/bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#![feature(test)]

extern crate test;

#[cfg(test)]
mod tests {
    use rand::seq::SliceRandom;
    use rand::thread_rng;
    use tantivy_common::serialize_vint_u32;
    use test::Bencher;

    /// Number of values serialized per benchmark iteration.
    const NUM_VALS: u32 = 20_000;

    /// Serializes every value of `vals` as a vint and sums the first encoded byte.
    ///
    /// The sum is returned so that the benchmark closure yields a value that depends on
    /// every serialization, which keeps the work from being optimized away.
    fn serialize_all(vals: &[u32]) -> u64 {
        let mut out = 0u64;
        for &val in vals {
            let mut buf = [0u8; 8];
            serialize_vint_u32(val, &mut buf);
            out += u64::from(buf[0]);
        }
        out
    }

    /// Serializes the values `0..NUM_VALS` in ascending order.
    #[bench]
    fn bench_vint(b: &mut Bencher) {
        let vals: Vec<u32> = (0..NUM_VALS).collect();
        b.iter(|| serialize_all(&vals));
    }

    /// Serializes the values `0..NUM_VALS` in random order.
    #[bench]
    fn bench_vint_rand(b: &mut Bencher) {
        // Asking `choose_multiple` for 100_000 samples out of a 20_000-element range can
        // only return those 20_000 elements, and it does not promise a random order.
        // Shuffle explicitly so this benchmark really differs from `bench_vint`.
        let mut vals: Vec<u32> = (0..NUM_VALS).collect();
        vals.shuffle(&mut thread_rng());
        b.iter(|| serialize_all(&vals));
    }
}
3 changes: 1 addition & 2 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ pub use group_by::GroupByIteratorExtended;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{
deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
serialize_vint_u32, write_u32_vint, VInt, VIntU128,
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};

Expand Down
75 changes: 16 additions & 59 deletions common/src/vint.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
use std::io;
use std::io::{Read, Write};

use byteorder::{ByteOrder, LittleEndian};

use super::BinarySerializable;

/// Variable int serializes a u128 number
Expand All @@ -19,26 +17,6 @@ pub fn serialize_vint_u128(mut val: u128, output: &mut Vec<u8>) {
}
}

/// Deserializes a u128 number
///
/// Returns the number and the slice after the vint.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if no byte with the stop bit set is found
/// within the first 19 bytes of `data`. This includes the case where `data` is empty or ends
/// before the vint is terminated.
pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> {
    let mut result = 0u128;
    let mut shift = 0u64;
    // Each byte carries 7 payload bits, so a u128 needs at most 19 bytes (19 * 7 >= 128).
    // Iterating over the bytes that are actually present (instead of indexing `data[i]`)
    // turns truncated input into an error rather than an out-of-bounds panic.
    for (i, &b) in data.iter().enumerate().take(19) {
        result |= u128::from(b % 128u8) << shift;
        if b >= STOP_BIT {
            return Ok((result, &data[i + 1..]));
        }
        shift += 7;
    }
    Err(io::Error::new(
        io::ErrorKind::InvalidData,
        "Failed to deserialize u128 vint",
    ))
}

/// Wrapper over a `u128` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VIntU128(pub u128);
Expand Down Expand Up @@ -80,17 +58,13 @@ pub struct VInt(pub u64);

const STOP_BIT: u8 = 128;

#[inline]
pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21;
const START_5: u64 = 1 << 28;

const STOP_1: u64 = START_2 - 1;
const STOP_2: u64 = START_3 - 1;
const STOP_3: u64 = START_4 - 1;
const STOP_4: u64 = START_5 - 1;

const MASK_1: u64 = 127;
const MASK_2: u64 = MASK_1 << 7;
const MASK_3: u64 = MASK_2 << 7;
Expand All @@ -99,35 +73,39 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {

let val = u64::from(val);
const STOP_BIT: u64 = 128u64;
let (res, num_bytes) = match val {
0..=STOP_1 => (val | STOP_BIT, 1),
START_2..=STOP_2 => (
let (res, num_bytes) = if val < START_2 {
(val | STOP_BIT, 1)
} else if val < START_3 {
(
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
2,
),
START_3..=STOP_3 => (
)
} else if val < START_4 {
(
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
3,
),
START_4..=STOP_4 => (
)
} else if val < START_5 {
(
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| (STOP_BIT << (8 * 3)),
4,
),
_ => (
)
} else {
(
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| ((val & MASK_5) << 4)
| (STOP_BIT << (8 * 4)),
5,
),
)
};
LittleEndian::write_u64(&mut buf[..], res);
*buf = res.to_le_bytes();
&buf[0..num_bytes]
}

Expand Down Expand Up @@ -245,7 +223,6 @@ impl BinarySerializable for VInt {
mod tests {

use super::{serialize_vint_u32, BinarySerializable, VInt};
use crate::vint::{deserialize_vint_u128, serialize_vint_u128, VIntU128};

fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];
Expand Down Expand Up @@ -287,26 +264,6 @@ mod tests {
assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val);
}

fn aux_test_vint_u128(val: u128) {
let mut data = vec![];
serialize_vint_u128(val, &mut data);
let (deser_val, _data) = deserialize_vint_u128(&data).unwrap();
assert_eq!(val, deser_val);

let mut out = vec![];
VIntU128(val).serialize(&mut out).unwrap();
let deser_val = VIntU128::deserialize(&mut &out[..]).unwrap();
assert_eq!(val, deser_val.0);
}

#[test]
fn test_vint_u128() {
    // The two smallest values, a value in the middle of the range, and the maximum.
    for val in [0, 1, u128::MAX / 3, u128::MAX] {
        aux_test_vint_u128(val);
    }
}

#[test]
fn test_vint_u32() {
aux_test_serialize_vint_u32(0);
Expand Down
1 change: 1 addition & 0 deletions src/schema/term.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ where B: AsRef<[u8]>
///
/// Do NOT rely on this byte representation in the index.
/// This value is likely to change in the future.
#[inline]
pub fn serialized_term(&self) -> &[u8] {
self.0.as_ref()
}
Expand Down
4 changes: 2 additions & 2 deletions src/tokenizer/regex_tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,11 @@ mod tests {

#[test]
fn test_regexp_tokenizer_error_on_invalid_regex() {
let tokenizer = RegexTokenizer::new(r"\@");
let tokenizer = RegexTokenizer::new(r"\@(");
assert_eq!(tokenizer.is_err(), true);
assert_eq!(
tokenizer.err().unwrap().to_string(),
"An invalid argument was passed: '\\@'"
"An invalid argument was passed: '\\@('"
);
}

Expand Down
2 changes: 2 additions & 0 deletions stacker/src/expull.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,14 @@ fn ensure_capacity<'a>(
}

impl<'a> ExpUnrolledLinkedListWriter<'a> {
/// Appends `val` to this writer, encoded as a vint.
#[inline]
pub fn write_u32_vint(&mut self, val: u32) {
    // Scratch space for the encoder, which returns the sub-slice it actually filled.
    let mut scratch = [0u8; 8];
    self.extend_from_slice(serialize_vint_u32(val, &mut scratch));
}

#[inline]
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
while !buf.is_empty() {
let add_len: usize;
Expand Down

0 comments on commit b5a3a94

Please sign in to comment.