From 43a4c8287ca595e2ff5a99451a968162073535ec Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 27 Aug 2022 20:57:06 +0200 Subject: [PATCH 1/2] Removing Deserializer trait And renaming the `Serializer` trait `FastFieldCodec`. --- fastfield_codecs/benches/bench.rs | 37 ++++----- fastfield_codecs/src/bitpacked.rs | 57 +++++++------ fastfield_codecs/src/blockwise_linear.rs | 35 ++++---- fastfield_codecs/src/lib.rs | 94 ++++++++++----------- fastfield_codecs/src/linear.rs | 46 +++++------ fastfield_codecs/src/main.rs | 24 +++--- src/fastfield/gcd.rs | 66 +++++++++------ src/fastfield/mod.rs | 2 +- src/fastfield/reader.rs | 101 ++++++++--------------- src/fastfield/serializer/mod.rs | 30 +++---- 10 files changed, 229 insertions(+), 263 deletions(-) diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 3a5ae58760..8b54ccf765 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -4,9 +4,9 @@ extern crate test; #[cfg(test)] mod tests { - use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer}; - use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; - use fastfield_codecs::linear::{LinearReader, LinearSerializer}; + use fastfield_codecs::bitpacked::BitpackedCodec; + use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; + use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::*; fn get_data() -> Vec { @@ -25,16 +25,10 @@ mod tests { fn value_iter() -> impl Iterator { 0..20_000 } - fn bench_get< - S: FastFieldCodecSerializer, - R: FastFieldCodecDeserializer + FastFieldDataAccess, - >( - b: &mut Bencher, - data: &[u64], - ) { + fn bench_get(b: &mut Bencher, data: &[u64]) { let mut bytes = vec![]; - S::serialize(&mut bytes, &data).unwrap(); - let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap(); + Codec::serialize(&mut bytes, &data).unwrap(); + let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap(); b.iter(|| { let mut sum = 0u64; for pos in value_iter() { @@ -45,10 +39,11 @@ mod tests { sum }); } - fn bench_create(b: &mut Bencher, data: &[u64]) { - let mut bytes = vec![]; + fn bench_create(b: &mut Bencher, data: &[u64]) { + let mut bytes = Vec::new(); b.iter(|| { - S::serialize(&mut bytes, &data).unwrap(); + bytes.clear(); + Codec::serialize(&mut bytes, &data).unwrap(); }); } @@ -57,32 +52,32 @@ mod tests { #[bench] fn bench_fastfield_bitpack_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_bitpack_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); + bench_get::(b, &data); } #[bench] fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); + bench_get::(b, &data); } #[bench] fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); + bench_get::(b, &data); } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { let min_value = data.iter().cloned().min().unwrap_or(0); diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index 43e0ea838d..76f9785ec6 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -4,9 +4,7 @@ use common::BinarySerializable; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; -use crate::{ - FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, -}; +use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; /// Depending on the field type, a different /// fast field is required. @@ -14,31 +12,11 @@ use crate::{ pub struct BitpackedReader { data: OwnedBytes, bit_unpacker: BitUnpacker, - pub min_value_u64: u64, - pub max_value_u64: u64, - pub num_vals: u64, + min_value_u64: u64, + max_value_u64: u64, + num_vals: u64, } -impl FastFieldCodecDeserializer for BitpackedReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - 24; - let (data, mut footer) = bytes.split(footer_offset); - let min_value = u64::deserialize(&mut footer)?; - let amplitude = u64::deserialize(&mut footer)?; - let num_vals = u64::deserialize(&mut footer)?; - let max_value = min_value + amplitude; - let num_bits = compute_num_bits(amplitude); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(BitpackedReader { - data, - bit_unpacker, - min_value_u64: min_value, - max_value_u64: max_value, - num_vals, - }) - } -} impl FastFieldDataAccess for BitpackedReader { #[inline] fn get_val(&self, doc: u64) -> u64 { @@ -111,12 +89,33 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> { } } -pub struct BitpackedSerializer {} +pub struct BitpackedCodec; -impl FastFieldCodecSerializer for BitpackedSerializer { +impl FastFieldCodec for BitpackedCodec { /// The CODEC_TYPE is an enum value used for serialization. const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked; + type Reader = BitpackedReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_offset = bytes.len() - 24; + let (data, mut footer) = bytes.split(footer_offset); + let min_value = u64::deserialize(&mut footer)?; + let amplitude = u64::deserialize(&mut footer)?; + let num_vals = u64::deserialize(&mut footer)?; + let max_value = min_value + amplitude; + let num_bits = compute_num_bits(amplitude); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(BitpackedReader { + data, + bit_unpacker, + min_value_u64: min_value, + max_value_u64: max_value, + num_vals, + }) + } + /// Serializes data with the BitpackedFastFieldSerializer. /// /// The serializer in fact encode the values by bitpacking @@ -159,7 +158,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) { - crate::tests::create_and_validate::(data, name); + crate::tests::create_and_validate::(data, name); } #[test] diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 7db3abc29e..6b5e380f76 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use crate::linear::{get_calculated_value, get_slope}; -use crate::{ - FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, -}; +use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; const CHUNK_SIZE: u64 = 512; @@ -148,17 +146,6 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio &interpolations[get_interpolation_position(doc)] } -impl FastFieldCodecDeserializer for BlockwiseLinearReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; - let footer_offset = bytes.len() - 4 - footer_len as usize; - let (data, mut footer) = bytes.split(footer_offset); - let footer = BlockwiseLinearFooter::deserialize(&mut footer)?; - Ok(BlockwiseLinearReader { data, footer }) - } -} - impl FastFieldDataAccess for BlockwiseLinearReader { #[inline] fn get_val(&self, idx: u64) -> u64 { @@ -191,10 +178,22 @@ impl FastFieldDataAccess for BlockwiseLinearReader { } /// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements. -pub struct BlockwiseLinearSerializer {} +pub struct BlockwiseLinearCodec; -impl FastFieldCodecSerializer for BlockwiseLinearSerializer { +impl FastFieldCodec for BlockwiseLinearCodec { const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear; + + type Reader = BlockwiseLinearReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; + let footer_offset = bytes.len() - 4 - footer_len as usize; + let (data, mut footer) = bytes.split(footer_offset); + let footer = BlockwiseLinearFooter::deserialize(&mut footer)?; + Ok(BlockwiseLinearReader { data, footer }) + } + /// Creates a new fast field serializer. fn serialize( write: &mut impl Write, @@ -369,9 +368,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::( - data, name, - ) + crate::tests::create_and_validate::(data, name) } const HIGHEST_BIT: u64 = 1 << 63; diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 626a0686c3..c124495268 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -12,12 +12,6 @@ pub mod bitpacked; pub mod blockwise_linear; pub mod linear; -pub trait FastFieldCodecDeserializer: Sized { - /// Reads the metadata and returns the CodecReader - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result - where Self: FastFieldDataAccess; -} - pub trait FastFieldDataAccess { fn get_val(&self, doc: u64) -> u64; fn min_value(&self) -> u64; @@ -69,20 +63,15 @@ impl FastFieldCodecType { /// The FastFieldSerializerEstimate trait is required on all variants /// of fast field compressions, to decide which one to choose. -pub trait FastFieldCodecSerializer { +pub trait FastFieldCodec { /// A codex needs to provide a unique name and id, which is /// used for debugging and de/serialization. const CODEC_TYPE: FastFieldCodecType; - /// Check if the Codec is able to compress the data - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; + type Reader: FastFieldDataAccess; - /// Returns an estimate of the compression ratio. - /// The baseline is uncompressed 64bit data. - /// - /// It could make sense to also return a value representing - /// computational complexity. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; + /// Reads the metadata and returns the CodecReader + fn open_from_bytes(bytes: OwnedBytes) -> io::Result; /// Serializes the data using the serializer into write. /// @@ -92,6 +81,16 @@ pub trait FastFieldCodecSerializer { write: &mut impl Write, fastfield_accessor: &dyn FastFieldDataAccess, ) -> io::Result<()>; + + /// Check if the Codec is able to compress the data + fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; + + /// Returns an estimate of the compression ratio. + /// The baseline is uncompressed 64bit data. + /// + /// It could make sense to also return a value representing + /// computational complexity. + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; } #[derive(Debug, Clone)] @@ -149,27 +148,21 @@ mod tests { use proptest::arbitrary::any; use proptest::proptest; - use crate::bitpacked::{BitpackedReader, BitpackedSerializer}; - use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; - use crate::linear::{LinearReader, LinearSerializer}; - - pub fn create_and_validate< - S: FastFieldCodecSerializer, - R: FastFieldCodecDeserializer + FastFieldDataAccess, - >( - data: &[u64], - name: &str, - ) -> (f32, f32) { - if !S::is_applicable(&data) { + use crate::bitpacked::BitpackedCodec; + use crate::blockwise_linear::BlockwiseLinearCodec; + use crate::linear::LinearCodec; + + pub fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { + if !Codec::is_applicable(&data) { return (f32::MAX, 0.0); } - let estimation = S::estimate(&data); + let estimation = Codec::estimate(&data); let mut out: Vec = Vec::new(); - S::serialize(&mut out, &data).unwrap(); + Codec::serialize(&mut out, &data).unwrap(); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); - let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap(); + let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap(); assert_eq!(reader.num_vals(), data.len() as u64); for (doc, orig_val) in data.iter().enumerate() { let val = reader.get_val(doc as u64); @@ -186,16 +179,16 @@ mod tests { proptest! { #[test] fn test_proptest_small(data in proptest::collection::vec(any::(), 1..10)) { - create_and_validate::(&data, "proptest linearinterpol"); - create_and_validate::(&data, "proptest multilinearinterpol"); - create_and_validate::(&data, "proptest bitpacked"); + create_and_validate::(&data, "proptest linearinterpol"); + create_and_validate::(&data, "proptest multilinearinterpol"); + create_and_validate::(&data, "proptest bitpacked"); } #[test] fn test_proptest_large(data in proptest::collection::vec(any::(), 1..6000)) { - create_and_validate::(&data, "proptest linearinterpol"); - create_and_validate::(&data, "proptest multilinearinterpol"); - create_and_validate::(&data, "proptest bitpacked"); + create_and_validate::(&data, "proptest linearinterpol"); + create_and_validate::(&data, "proptest multilinearinterpol"); + create_and_validate::(&data, "proptest bitpacked"); } } @@ -216,13 +209,10 @@ mod tests { data_and_names } - fn test_codec< - S: FastFieldCodecSerializer, - R: FastFieldDataAccess + FastFieldCodecDeserializer, - >() { - let codec_name = format!("{:?}", S::CODEC_TYPE); + fn test_codec() { + let codec_name = format!("{:?}", C::CODEC_TYPE); for (data, dataset_name) in get_codec_test_data_sets() { - let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); + let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); let result = if estimate == f32::MAX { "Disabled".to_string() } else { @@ -233,15 +223,15 @@ mod tests { } #[test] fn test_codec_bitpacking() { - test_codec::(); + test_codec::(); } #[test] fn test_codec_interpolation() { - test_codec::(); + test_codec::(); } #[test] fn test_codec_multi_interpolation() { - test_codec::(); + test_codec::(); } use super::*; @@ -250,24 +240,24 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); - let linear_interpol_estimation = LinearSerializer::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data); assert_le!(linear_interpol_estimation, 0.01); - let multi_linear_interpol_estimation = BlockwiseLinearSerializer::estimate(&data); + let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data); assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); - let bitpacked_estimation = BitpackedSerializer::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data); assert_le!(linear_interpol_estimation, bitpacked_estimation); } #[test] fn estimation_test_bad_interpolation_case() { let data = vec![200, 10, 10, 10, 10, 1000, 20]; - let linear_interpol_estimation = LinearSerializer::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data); assert_le!(linear_interpol_estimation, 0.32); - let bitpacked_estimation = BitpackedSerializer::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data); assert_le!(bitpacked_estimation, linear_interpol_estimation); } #[test] @@ -277,10 +267,10 @@ mod tests { // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = LinearSerializer::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data); assert_le!(linear_interpol_estimation, 0.35); - let bitpacked_estimation = BitpackedSerializer::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data); assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, linear_interpol_estimation); } diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index bf50f7f1b6..d2d53143d7 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -5,9 +5,7 @@ use common::{BinarySerializable, FixedSize}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; -use crate::{ - FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, -}; +use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; /// Depending on the field type, a different /// fast field is required. @@ -59,24 +57,6 @@ impl FixedSize for LinearFooter { const SIZE_IN_BYTES: usize = 56; } -impl FastFieldCodecDeserializer for LinearReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES; - let (data, mut footer) = bytes.split(footer_offset); - let footer = LinearFooter::deserialize(&mut footer)?; - let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals); - let num_bits = compute_num_bits(footer.relative_max_value); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(LinearReader { - data, - bit_unpacker, - footer, - slope, - }) - } -} - impl FastFieldDataAccess for LinearReader { #[inline] fn get_val(&self, doc: u64) -> u64 { @@ -100,7 +80,7 @@ impl FastFieldDataAccess for LinearReader { /// Fastfield serializer, which tries to guess values by linear interpolation /// and stores the difference bitpacked. -pub struct LinearSerializer {} +pub struct LinearCodec; #[inline] pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 { @@ -141,9 +121,27 @@ pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 { } } -impl FastFieldCodecSerializer for LinearSerializer { +impl FastFieldCodec for LinearCodec { const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear; + type Reader = LinearReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES; + let (data, mut footer) = bytes.split(footer_offset); + let footer = LinearFooter::deserialize(&mut footer)?; + let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals); + let num_bits = compute_num_bits(footer.relative_max_value); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(LinearReader { + data, + bit_unpacker, + footer, + slope, + }) + } + /// Creates a new fast field serializer. fn serialize( write: &mut impl Write, @@ -267,7 +265,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::(data, name) + crate::tests::create_and_validate::(data, name) } #[test] diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 4f5a1d2397..93204cb250 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,8 +1,8 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; -use fastfield_codecs::linear::LinearSerializer; -use fastfield_codecs::{FastFieldCodecSerializer, FastFieldCodecType, FastFieldStats}; +use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; +use fastfield_codecs::linear::LinearCodec; +use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats}; use prettytable::{Cell, Row, Table}; fn main() { @@ -13,11 +13,11 @@ fn main() { for (data, data_set_name) in get_codec_test_data_sets() { let mut results = vec![]; - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::(&data); results.push(res); - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::(&data); results.push(res); - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::(&data); results.push(res); // let best_estimation_codec = results @@ -89,19 +89,19 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { data_and_names } -pub fn serialize_with_codec( +pub fn serialize_with_codec( data: &[u64], ) -> (bool, f32, f32, FastFieldCodecType) { - let is_applicable = S::is_applicable(&data); + let is_applicable = C::is_applicable(&data); if !is_applicable { - return (false, 0.0, 0.0, S::CODEC_TYPE); + return (false, 0.0, 0.0, C::CODEC_TYPE); } - let estimation = S::estimate(&data); + let estimation = C::estimate(&data); let mut out = vec![]; - S::serialize(&mut out, &data).unwrap(); + C::serialize(&mut out, &data).unwrap(); let actual_compression = out.len() as f32 / (data.len() * 8) as f32; - (true, estimation, actual_compression, S::CODEC_TYPE) + (true, estimation, actual_compression, C::CODEC_TYPE) } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { diff --git a/src/fastfield/gcd.rs b/src/fastfield/gcd.rs index 8e706d12ff..37fe38e419 100644 --- a/src/fastfield/gcd.rs +++ b/src/fastfield/gcd.rs @@ -3,7 +3,7 @@ use std::num::NonZeroU64; use common::BinarySerializable; use fastdivide::DividerU64; -use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldDataAccess}; +use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess}; use ownedbytes::OwnedBytes; pub const GCD_DEFAULT: u64 = 1; @@ -12,50 +12,70 @@ pub const GCD_DEFAULT: u64 = 1; /// /// Holds the data and the codec to the read the data. #[derive(Clone)] -pub struct GCDFastFieldCodec { +pub struct GCDReader { + gcd_params: GCDParams, + reader: CodecReader, +} + +#[derive(Debug, Clone, Copy)] +struct GCDParams { gcd: u64, min_value: u64, num_vals: u64, - reader: CodecReader, } -impl FastFieldCodecDeserializer - for GCDFastFieldCodec -{ - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result { - let footer_offset = bytes.len() - 24; - let (body, mut footer) = bytes.split(footer_offset); - let gcd = u64::deserialize(&mut footer)?; - let min_value = u64::deserialize(&mut footer)?; - let num_vals = u64::deserialize(&mut footer)?; - let reader = C::open_from_bytes(body)?; - Ok(GCDFastFieldCodec { +impl GCDParams { + pub fn eval(&self, val: u64) -> u64 { + self.min_value + self.gcd * val + } +} + +impl BinarySerializable for GCDParams { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + self.gcd.serialize(writer)?; + self.min_value.serialize(writer)?; + self.num_vals.serialize(writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let gcd: u64 = u64::deserialize(reader)?; + let min_value: u64 = u64::deserialize(reader)?; + let num_vals: u64 = u64::deserialize(reader)?; + Ok(Self { gcd, min_value, num_vals, - reader, }) } } -impl FastFieldDataAccess for GCDFastFieldCodec { +pub fn open_gcd_from_bytes( + bytes: OwnedBytes, +) -> io::Result> { + let footer_offset = bytes.len() - 24; + let (body, mut footer) = bytes.split(footer_offset); + let gcd_params = GCDParams::deserialize(&mut footer)?; + let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?; + Ok(GCDReader { gcd_params, reader }) +} + +impl FastFieldDataAccess for GCDReader { #[inline] fn get_val(&self, doc: u64) -> u64 { - let mut data = self.reader.get_val(doc); - data *= self.gcd; - data += self.min_value; - data + let val = self.reader.get_val(doc); + self.gcd_params.eval(val) } fn min_value(&self) -> u64 { - self.min_value + self.reader.min_value() * self.gcd + self.gcd_params.eval(self.reader.min_value()) } fn max_value(&self) -> u64 { - self.min_value + self.reader.max_value() * self.gcd + self.gcd_params.eval(self.reader.max_value()) } fn num_vals(&self) -> u64 { - self.num_vals + self.gcd_params.num_vals } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 851d5df6a5..c76cd7e4ef 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -26,7 +26,7 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; -pub(crate) use self::gcd::{find_gcd, GCDFastFieldCodec, GCD_DEFAULT}; +pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT}; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; pub use self::reader::{DynamicFastFieldReader, FastFieldReader}; pub use self::readers::FastFieldReaders; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 7afedf6f52..70da86e641 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -3,15 +3,16 @@ use std::marker::PhantomData; use std::path::Path; use common::BinarySerializable; -use fastfield_codecs::bitpacked::BitpackedReader; -use fastfield_codecs::blockwise_linear::BlockwiseLinearReader; -use fastfield_codecs::linear::LinearReader; -use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldCodecType, FastFieldDataAccess}; +use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader}; +use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader}; +use fastfield_codecs::linear::{LinearCodec, LinearReader}; +use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; -use super::{FastValue, GCDFastFieldCodec}; +use super::gcd::open_gcd_from_bytes; +use super::FastValue; use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; use crate::error::DataCorruption; -use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; +use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader}; use crate::schema::{Schema, FAST}; use crate::DocId; @@ -68,11 +69,11 @@ pub enum DynamicFastFieldReader { BlockwiseLinear(FastFieldReaderCodecWrapper), /// GCD and Bitpacked compressed fastfield data. - BitpackedGCD(FastFieldReaderCodecWrapper>), + BitpackedGCD(FastFieldReaderCodecWrapper>), /// GCD and Linear interpolated values + bitpacked - LinearGCD(FastFieldReaderCodecWrapper>), + LinearGCD(FastFieldReaderCodecWrapper>), /// GCD and Blockwise linear interpolated values + bitpacked - BlockwiseLinearGCD(FastFieldReaderCodecWrapper>), + BlockwiseLinearGCD(FastFieldReaderCodecWrapper>), } impl DynamicFastFieldReader { @@ -83,46 +84,27 @@ impl DynamicFastFieldReader { ) -> crate::Result> { let reader = match codec_type { FastFieldCodecType::Bitpacked => { - DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::< - Item, - BitpackedReader, - >::open_from_bytes(bytes)?) + DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into()) } - FastFieldCodecType::Linear => DynamicFastFieldReader::Linear( - FastFieldReaderCodecWrapper::::open_from_bytes(bytes)?, - ), - FastFieldCodecType::BlockwiseLinear => { - DynamicFastFieldReader::BlockwiseLinear(FastFieldReaderCodecWrapper::< - Item, - BlockwiseLinearReader, - >::open_from_bytes(bytes)?) + FastFieldCodecType::Linear => { + DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into()) } + FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear( + BlockwiseLinearCodec::open_from_bytes(bytes)?.into(), + ), FastFieldCodecType::Gcd => { let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; match codec_type { - FastFieldCodecType::Bitpacked => { - DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::< - Item, - GCDFastFieldCodec, - >::open_from_bytes( - bytes - )?) - } - FastFieldCodecType::Linear => { - DynamicFastFieldReader::LinearGCD(FastFieldReaderCodecWrapper::< - Item, - GCDFastFieldCodec, - >::open_from_bytes( - bytes - )?) - } + FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD( + open_gcd_from_bytes::(bytes)?.into(), + ), + FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD( + open_gcd_from_bytes::(bytes)?.into(), + ), FastFieldCodecType::BlockwiseLinear => { - DynamicFastFieldReader::BlockwiseLinearGCD(FastFieldReaderCodecWrapper::< - Item, - GCDFastFieldCodec, - >::open_from_bytes( - bytes - )?) + DynamicFastFieldReader::BlockwiseLinearGCD( + open_gcd_from_bytes::(bytes)?.into(), + ) } FastFieldCodecType::Gcd => { return Err(DataCorruption::comment_only( @@ -199,33 +181,18 @@ pub struct FastFieldReaderCodecWrapper { _phantom: PhantomData, } -impl - FastFieldReaderCodecWrapper +impl From + for FastFieldReaderCodecWrapper { - /// Opens a fast field given a file. - pub fn open(file: FileSlice) -> crate::Result { - let mut bytes = file.read_bytes()?; - let codec_code = bytes.read_u8(); - let codec_type = FastFieldCodecType::from_code(codec_code).ok_or_else(|| { - DataCorruption::comment_only("Unknown codec code does not exist `{codec_code}`") - })?; - assert_eq!( - FastFieldCodecType::Bitpacked, - codec_type, - "Tried to open fast field as bitpacked encoded (id=1), but got serializer with \ - different id" - ); - Self::open_from_bytes(bytes) - } - /// Opens a fast field given the bytes. - pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result { - let reader = C::open_from_bytes(bytes)?; - Ok(FastFieldReaderCodecWrapper { + fn from(reader: CodecReader) -> Self { + FastFieldReaderCodecWrapper { reader, _phantom: PhantomData, - }) + } } +} +impl FastFieldReaderCodecWrapper { #[inline] pub(crate) fn get_u64(&self, doc: u64) -> Item { let data = self.reader.get_val(doc); @@ -251,8 +218,8 @@ impl } } -impl - FastFieldReader for FastFieldReaderCodecWrapper +impl FastFieldReader + for FastFieldReaderCodecWrapper { /// Return the value associated to the given document. /// diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 871a049787..6bbb33fafd 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -3,11 +3,11 @@ use std::num::NonZeroU64; use common::{BinarySerializable, CountingWriter}; use fastdivide::DividerU64; -pub use fastfield_codecs::bitpacked::{BitpackedSerializer, BitpackedSerializerLegacy}; -use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; -use fastfield_codecs::linear::LinearSerializer; +pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy}; +use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; +use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::FastFieldCodecType; -pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; +pub use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess, FastFieldStats}; use super::{find_gcd, ALL_CODECS, GCD_DEFAULT}; use crate::directory::{CompositeWrite, WritePtr}; @@ -64,15 +64,15 @@ impl From for FastFieldCodecEnableCheck { // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait // https://github.com/rust-lang/rust/pull/86176 -fn codec_estimation( - fastfield_accessor: &A, +fn codec_estimation( + fastfield_accessor: &impl FastFieldDataAccess, estimations: &mut Vec<(f32, FastFieldCodecType)>, ) { - if !T::is_applicable(fastfield_accessor) { + if !C::is_applicable(fastfield_accessor) { return; } - let ratio = T::estimate(fastfield_accessor); - estimations.push((ratio, T::CODEC_TYPE)); + let ratio = C::estimate(fastfield_accessor); + estimations.push((ratio, C::CODEC_TYPE)); } impl CompositeFastFieldSerializer { @@ -204,13 +204,13 @@ impl CompositeFastFieldSerializer { let mut estimations = vec![]; if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) { - codec_estimation::(&fastfield_accessor, &mut estimations); + codec_estimation::(&fastfield_accessor, &mut estimations); } if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) { - codec_estimation::(&fastfield_accessor, &mut estimations); + codec_estimation::(&fastfield_accessor, &mut estimations); } if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) { - codec_estimation::(&fastfield_accessor, &mut estimations); + codec_estimation::(&fastfield_accessor, &mut estimations); } if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) { @@ -229,13 +229,13 @@ impl CompositeFastFieldSerializer { Self::write_header(field_write, codec_type)?; match codec_type { FastFieldCodecType::Bitpacked => { - BitpackedSerializer::serialize(field_write, &fastfield_accessor)?; + BitpackedCodec::serialize(field_write, &fastfield_accessor)?; } FastFieldCodecType::Linear => { - LinearSerializer::serialize(field_write, &fastfield_accessor)?; + LinearCodec::serialize(field_write, &fastfield_accessor)?; } FastFieldCodecType::BlockwiseLinear => { - BlockwiseLinearSerializer::serialize(field_write, &fastfield_accessor)?; + BlockwiseLinearCodec::serialize(field_write, &fastfield_accessor)?; } FastFieldCodecType::Gcd => { return Err(io::Error::new( From e8a6e123ae520e907ce534bf9301343f9868b3dc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 27 Aug 2022 21:26:48 +0200 Subject: [PATCH 2/2] Small refactoring estimate. --- fastfield_codecs/src/bitpacked.rs | 12 ++--- fastfield_codecs/src/blockwise_linear.rs | 33 ++++++------ fastfield_codecs/src/lib.rs | 64 ++++++++++++------------ fastfield_codecs/src/linear.rs | 37 +++++++------- fastfield_codecs/src/main.rs | 47 +++++++---------- src/fastfield/serializer/mod.rs | 6 +-- 6 files changed, 92 insertions(+), 107 deletions(-) diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index 76f9785ec6..4270877bd3 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -141,21 +141,19 @@ impl FastFieldCodec for BitpackedCodec { Ok(()) } - fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool { - true - } - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option { let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value(); let num_bits = compute_num_bits(amplitude); let num_bits_uncompressed = 64; - num_bits as f32 / num_bits_uncompressed as f32 + Some(num_bits as f32 / num_bits_uncompressed as f32) } } #[cfg(test)] mod tests { use super::*; - use crate::tests::get_codec_test_data_sets; + use crate::tests::get_codec_test_datasets; fn create_and_validate(data: &[u64], name: &str) { crate::tests::create_and_validate::(data, name); @@ -163,7 +161,7 @@ mod tests { #[test] fn test_with_codec_data_sets() { - let data_sets = get_codec_test_data_sets(); + let data_sets = get_codec_test_datasets(); for (mut data, name) in data_sets { create_and_validate(&data, name); data.reverse(); diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 6b5e380f76..619d1facac 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -289,10 +289,14 @@ impl FastFieldCodec for BlockwiseLinearCodec { Ok(()) } - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { - if fastfield_accessor.num_vals() < 5_000 { - return false; + /// estimation for linear interpolation is hard because, you don't know + /// where the local maxima are for the deviation of the calculated value and + /// the offset is also unknown. + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option { + if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE { + return None; } + // On serialization the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. // For this we take the maximum theroretical offset and add this to the max value. @@ -304,14 +308,9 @@ impl FastFieldCodec for BlockwiseLinearCodec { .checked_add(theorethical_maximum_offset) .is_none() { - return false; + return None; } - true - } - /// estimation for linear interpolation is hard because, you don't know - /// where the local maxima are for the deviation of the calculated value and - /// the offset is also unknown. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + let first_val_in_first_block = fastfield_accessor.get_val(0); let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals()); let last_val_in_first_block = @@ -350,7 +349,7 @@ impl FastFieldCodec for BlockwiseLinearCodec { // function metadata per block + 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE); let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); - num_bits as f32 / num_bits_uncompressed as f32 + Some(num_bits as f32 / num_bits_uncompressed as f32) } } @@ -365,10 +364,10 @@ fn distance + Ord>(x: T, y: T) -> T { #[cfg(test)] mod tests { use super::*; - use crate::tests::get_codec_test_data_sets; + use crate::tests::get_codec_test_datasets; - fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::(data, name) + fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> { + crate::tests::create_and_validate::(data, name) } const HIGHEST_BIT: u64 = 1 << 63; @@ -382,7 +381,7 @@ mod tests { .map(i64_to_u64) .collect::>(); let (estimate, actual_compression) = - create_and_validate(&data, "simple monotonically large i64"); + create_and_validate(&data, "simple monotonically large i64").unwrap(); assert!(actual_compression < 0.2); assert!(estimate < 0.20); assert!(estimate > 0.15); @@ -393,7 +392,7 @@ mod tests { fn test_compression() { let data = (10..=6_000_u64).collect::>(); let (estimate, actual_compression) = - create_and_validate(&data, "simple monotonically large"); + create_and_validate(&data, "simple monotonically large").unwrap(); assert!(actual_compression < 0.2); assert!(estimate < 0.20); assert!(estimate > 0.15); @@ -402,7 +401,7 @@ mod tests { #[test] fn test_with_codec_data_sets() { - let data_sets = get_codec_test_data_sets(); + let data_sets = get_codec_test_datasets(); for (mut data, name) in data_sets { create_and_validate(&data, name); data.reverse(); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index c124495268..172f7e0d90 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -82,15 +82,14 @@ pub trait FastFieldCodec { fastfield_accessor: &dyn FastFieldDataAccess, ) -> io::Result<()>; - /// Check if the Codec is able to compress the data - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; - /// Returns an estimate of the compression ratio. + /// If the codec is not applicable, returns `None`. + /// /// The baseline is uncompressed 64bit data. /// /// It could make sense to also return a value representing /// computational complexity. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option; } #[derive(Debug, Clone)] @@ -152,11 +151,12 @@ mod tests { use crate::blockwise_linear::BlockwiseLinearCodec; use crate::linear::LinearCodec; - pub fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - if !Codec::is_applicable(&data) { - return (f32::MAX, 0.0); - } - let estimation = Codec::estimate(&data); + pub fn create_and_validate( + data: &[u64], + name: &str, + ) -> Option<(f32, f32)> { + let estimation = Codec::estimate(&data)?; + let mut out: Vec = Vec::new(); Codec::serialize(&mut out, &data).unwrap(); @@ -164,16 +164,15 @@ mod tests { let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap(); assert_eq!(reader.num_vals(), data.len() as u64); - for (doc, orig_val) in data.iter().enumerate() { + for (doc, orig_val) in data.iter().copied().enumerate() { let val = reader.get_val(doc as u64); - if val != *orig_val { - panic!( - "val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \ - {data:?}", - ); - } + assert_eq!( + val, orig_val, + "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \ + `{data:?}`", + ); } - (estimation, actual_compression) + Some((estimation, actual_compression)) } proptest! { @@ -193,10 +192,10 @@ mod tests { } - pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { + pub fn get_codec_test_datasets() -> Vec<(Vec, &'static str)> { let mut data_and_names = vec![]; - let data = (10..=20_u64).collect::>(); + let data = (10..=10_000_u64).collect::>(); data_and_names.push((data, "simple monotonically increasing")); data_and_names.push(( @@ -211,12 +210,13 @@ mod tests { fn test_codec() { let codec_name = format!("{:?}", C::CODEC_TYPE); - for (data, dataset_name) in get_codec_test_data_sets() { - let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); - let result = if estimate == f32::MAX { - "Disabled".to_string() - } else { + for (data, dataset_name) in get_codec_test_datasets() { + let estimate_actual_opt: Option<(f32, f32)> = + crate::tests::create_and_validate::(&data, dataset_name); + let result = if let Some((estimate, actual)) = estimate_actual_opt { format!("Estimate `{estimate}` Actual `{actual}`") + } else { + "Disabled".to_string() }; println!("Codec {codec_name}, DataSet {dataset_name}, {result}"); } @@ -240,37 +240,37 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); - let linear_interpol_estimation = LinearCodec::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.01); - let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data); + let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap(); assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); - let bitpacked_estimation = BitpackedCodec::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, bitpacked_estimation); } #[test] fn estimation_test_bad_interpolation_case() { let data = vec![200, 10, 10, 10, 10, 1000, 20]; - let linear_interpol_estimation = LinearCodec::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.32); - let bitpacked_estimation = BitpackedCodec::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); assert_le!(bitpacked_estimation, linear_interpol_estimation); } #[test] fn estimation_test_bad_interpolation_case_monotonically_increasing() { - let mut data = (200..=20000_u64).collect::>(); + let mut data: Vec = (200..=20000_u64).collect(); data.push(1_000_000); // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = LinearCodec::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.35); - let bitpacked_estimation = BitpackedCodec::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, linear_interpol_estimation); } diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index d2d53143d7..e49b202d8d 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -192,10 +192,15 @@ impl FastFieldCodec for LinearCodec { footer.serialize(write)?; Ok(()) } - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { + + /// estimation for linear interpolation is hard because, you don't know + /// where the local maxima for the deviation of the calculated value are and + /// the offset to shift all values to >=0 is also unknown. + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option { if fastfield_accessor.num_vals() < 3 { - return false; // disable compressor for this case + return None; // disable compressor for this case } + // On serialisation the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. // For this we take the maximum theroretical offset and add this to the max value. @@ -207,14 +212,9 @@ impl FastFieldCodec for LinearCodec { .checked_add(theorethical_maximum_offset) .is_none() { - return false; + return None; } - true - } - /// estimation for linear interpolation is hard because, you don't know - /// where the local maxima for the deviation of the calculated value are and - /// the offset to shift all values to >=0 is also unknown. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + let first_val = fastfield_accessor.get_val(0); let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1); let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals()); @@ -246,7 +246,7 @@ impl FastFieldCodec for LinearCodec { * fastfield_accessor.num_vals() + LinearFooter::SIZE_IN_BYTES as u64; let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); - num_bits as f32 / num_bits_uncompressed as f32 + Some(num_bits as f32 / num_bits_uncompressed as f32) } } @@ -262,10 +262,10 @@ fn distance + Ord>(x: T, y: T) -> T { #[cfg(test)] mod tests { use super::*; - use crate::tests::get_codec_test_data_sets; + use crate::tests::get_codec_test_datasets; - fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::(data, name) + fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> { + crate::tests::create_and_validate::(data, name) } #[test] @@ -292,15 +292,15 @@ mod tests { fn test_compression() { let data = (10..=6_000_u64).collect::>(); let (estimate, actual_compression) = - create_and_validate(&data, "simple monotonically large"); + create_and_validate(&data, "simple monotonically large").unwrap(); assert!(actual_compression < 0.01); assert!(estimate < 0.01); } #[test] - fn test_with_codec_data_sets() { - let data_sets = get_codec_test_data_sets(); + fn test_with_codec_datasets() { + let data_sets = get_codec_test_datasets(); for (mut data, name) in data_sets { create_and_validate(&data, name); data.reverse(); @@ -337,9 +337,10 @@ mod tests { #[test] fn linear_interpol_fast_field_rand() { for _ in 0..5000 { - let mut data = (0..50).map(|_| rand::random::()).collect::>(); + let mut data = (0..10_000) + .map(|_| rand::random::()) + .collect::>(); create_and_validate(&data, "random"); - data.reverse(); create_and_validate(&data, "random"); } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 93204cb250..848392b666 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,5 +1,6 @@ #[macro_use] extern crate prettytable; +use fastfield_codecs::bitpacked::BitpackedCodec; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats}; @@ -12,37 +13,30 @@ fn main() { table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); for (data, data_set_name) in get_codec_test_data_sets() { - let mut results = vec![]; - let res = serialize_with_codec::(&data); - results.push(res); - let res = serialize_with_codec::(&data); - results.push(res); - let res = serialize_with_codec::(&data); - results.push(res); - - // let best_estimation_codec = results - //.iter() - //.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap()) - //.unwrap(); + let results: Vec<(f32, f32, FastFieldCodecType)> = [ + serialize_with_codec::(&data), + serialize_with_codec::(&data), + serialize_with_codec::(&data), + serialize_with_codec::(&data), + ] + .into_iter() + .flatten() + .collect(); let best_compression_ratio_codec = results .iter() - .min_by(|res1, res2| res1.partial_cmp(res2).unwrap()) + .min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap()) .cloned() .unwrap(); table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")])); - for (is_applicable, est, comp, codec_type) in results { - let (est_cell, ratio_cell) = if !is_applicable { - ("Codec Disabled".to_string(), "".to_string()) - } else { - (est.to_string(), comp.to_string()) - }; + for (est, comp, codec_type) in results { + let est_cell = est.to_string(); + let ratio_cell = comp.to_string(); let style = if comp == best_compression_ratio_codec.1 { "Fb" } else { "" }; - table.add_row(Row::new(vec![ Cell::new(&format!("{codec_type:?}")).style_spec("bFg"), Cell::new(&ratio_cell).style_spec(style), @@ -91,17 +85,12 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { pub fn serialize_with_codec( data: &[u64], -) -> (bool, f32, f32, FastFieldCodecType) { - let is_applicable = C::is_applicable(&data); - if !is_applicable { - return (false, 0.0, 0.0, C::CODEC_TYPE); - } - let estimation = C::estimate(&data); - let mut out = vec![]; +) -> Option<(f32, f32, FastFieldCodecType)> { + let estimation = C::estimate(&data)?; + let mut out = Vec::new(); C::serialize(&mut out, &data).unwrap(); - let actual_compression = out.len() as f32 / (data.len() * 8) as f32; - (true, estimation, actual_compression, C::CODEC_TYPE) + Some((estimation, actual_compression, C::CODEC_TYPE)) } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 6bbb33fafd..fbda73b5a5 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -68,11 +68,9 @@ fn codec_estimation( fastfield_accessor: &impl FastFieldDataAccess, estimations: &mut Vec<(f32, FastFieldCodecType)>, ) { - if !C::is_applicable(fastfield_accessor) { - return; + if let Some(ratio) = C::estimate(fastfield_accessor) { + estimations.push((ratio, C::CODEC_TYPE)); } - let ratio = C::estimate(fastfield_accessor); - estimations.push((ratio, C::CODEC_TYPE)); } impl CompositeFastFieldSerializer {