Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing Deserializer trait #1489

Merged
merged 2 commits into from
Aug 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions fastfield_codecs/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ extern crate test;

#[cfg(test)]
mod tests {
use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer};
use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer};
use fastfield_codecs::linear::{LinearReader, LinearSerializer};
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::*;

fn get_data() -> Vec<u64> {
Expand All @@ -25,16 +25,10 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn bench_get<
S: FastFieldCodecSerializer,
R: FastFieldCodecDeserializer + FastFieldDataAccess,
>(
b: &mut Bencher,
data: &[u64],
) {
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
S::serialize(&mut bytes, &data).unwrap();
let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
Codec::serialize(&mut bytes, &data).unwrap();
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
Expand All @@ -45,10 +39,11 @@ mod tests {
sum
});
}
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = Vec::new();
b.iter(|| {
S::serialize(&mut bytes, &data).unwrap();
bytes.clear();
Codec::serialize(&mut bytes, &data).unwrap();
});
}

Expand All @@ -57,32 +52,32 @@ mod tests {
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedSerializer>(b, &data);
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearSerializer>(b, &data);
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearSerializer>(b, &data);
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedSerializer, BitpackedReader>(b, &data);
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearSerializer, LinearReader>(b, &data);
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearSerializer, BlockwiseLinearReader>(b, &data);
bench_get::<BlockwiseLinearCodec>(b, &data);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
Expand Down
69 changes: 33 additions & 36 deletions fastfield_codecs/src/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,19 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};

/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
pub num_vals: u64,
min_value_u64: u64,
max_value_u64: u64,
num_vals: u64,
}

impl FastFieldCodecDeserializer for BitpackedReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
})
}
}
impl FastFieldDataAccess for BitpackedReader {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
Expand Down Expand Up @@ -111,12 +89,33 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
}
}

pub struct BitpackedSerializer {}
pub struct BitpackedCodec;

impl FastFieldCodecSerializer for BitpackedSerializer {
impl FastFieldCodec for BitpackedCodec {
/// The CODEC_TYPE is an enum value used for serialization.
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;

type Reader = BitpackedReader;

/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
})
}

/// Serializes data with the BitpackedFastFieldSerializer.
///
/// The serializer in fact encode the values by bitpacking
Expand All @@ -142,29 +141,27 @@ impl FastFieldCodecSerializer for BitpackedSerializer {

Ok(())
}
fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
true
}
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
use crate::tests::get_codec_test_datasets;

fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedSerializer, BitpackedReader>(data, name);
crate::tests::create_and_validate::<BitpackedCodec>(data, name);
}

#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
Expand Down
66 changes: 31 additions & 35 deletions fastfield_codecs/src/blockwise_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::linear::{get_calculated_value, get_slope};
use crate::{
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};

const CHUNK_SIZE: u64 = 512;

Expand Down Expand Up @@ -148,17 +146,6 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)]
}

impl FastFieldCodecDeserializer for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
}

impl FastFieldDataAccess for BlockwiseLinearReader {
#[inline]
fn get_val(&self, idx: u64) -> u64 {
Expand Down Expand Up @@ -191,10 +178,22 @@ impl FastFieldDataAccess for BlockwiseLinearReader {
}

/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct BlockwiseLinearSerializer {}
pub struct BlockwiseLinearCodec;

impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
impl FastFieldCodec for BlockwiseLinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;

type Reader = BlockwiseLinearReader;

/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}

/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
Expand Down Expand Up @@ -290,10 +289,14 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
Ok(())
}

fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
if fastfield_accessor.num_vals() < 5_000 {
return false;
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;
}

// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theroretical offset and add this to the max value.
Expand All @@ -305,14 +308,9 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
.checked_add(theorethical_maximum_offset)
.is_none()
{
return false;
return None;
}
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
let last_val_in_first_block =
Expand Down Expand Up @@ -351,7 +349,7 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
// function metadata per block
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

Expand All @@ -366,12 +364,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
use crate::tests::get_codec_test_datasets;

fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(
data, name,
)
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
}

const HIGHEST_BIT: u64 = 1 << 63;
Expand All @@ -385,7 +381,7 @@ mod tests {
.map(i64_to_u64)
.collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large i64");
create_and_validate(&data, "simple monotonically large i64").unwrap();
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
Expand All @@ -396,7 +392,7 @@ mod tests {
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
create_and_validate(&data, "simple monotonically large").unwrap();
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
Expand All @@ -405,7 +401,7 @@ mod tests {

#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
Expand Down
Loading