diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index f91993a66e..de038787b0 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -3,6 +3,7 @@ use std::io::{self, Write}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; +use crate::column::EstimateColumn; use crate::serialize::NormalizedHeader; use crate::{Column, FastFieldCodec, FastFieldCodecType}; @@ -75,7 +76,7 @@ impl FastFieldCodec for BitpackedCodec { Ok(()) } - fn estimate(column: &dyn Column) -> Option { + fn estimate(column: &EstimateColumn) -> Option { let num_bits = compute_num_bits(column.max_value()); let num_bits_uncompressed = 64; Some(num_bits as f32 / num_bits_uncompressed as f32) diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 360f5c3ade..a638805a97 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -5,6 +5,7 @@ use common::{BinarySerializable, CountingWriter, DeserializeFrom}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; +use crate::column::EstimateColumn; use crate::line::Line; use crate::serialize::NormalizedHeader; use crate::{Column, FastFieldCodec, FastFieldCodecType, VecColumn}; @@ -71,7 +72,7 @@ impl FastFieldCodec for BlockwiseLinearCodec { } // Estimate first_chunk and extrapolate - fn estimate(column: &dyn crate::Column) -> Option { + fn estimate(column: &EstimateColumn) -> Option { if column.num_vals() < 10 * CHUNK_SIZE as u64 { return None; } diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index b40f76b692..0a9e179028 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -137,6 +137,57 @@ where V: AsRef<[T]> + ?Sized } } +// Creates a view over a Column with a limited number of vals. Stats like min max are unchanged +pub struct EstimateColumn<'a> { + column: &'a dyn Column, + num_vals: u64, +} +impl<'a> EstimateColumn<'a> { + pub(crate) fn new(column: &'a dyn Column) -> Self { + let limit_num_vals = column.num_vals().min(100_000); + Self { + column, + num_vals: limit_num_vals, + } + } +} + +impl<'a> Column for EstimateColumn<'a> { + fn get_val(&self, idx: u64) -> u64 { + (*self.column).get_val(idx) + } + + fn min_value(&self) -> u64 { + (*self.column).min_value() + } + + fn max_value(&self) -> u64 { + (*self.column).max_value() + } + + fn num_vals(&self) -> u64 { + self.num_vals + } + + fn iter<'b>(&'b self) -> Box + 'b> { + Box::new((*self.column).iter().take(self.num_vals as usize)) + } + + fn get_range(&self, start: u64, output: &mut [u64]) { + (*self.column).get_range(start, output) + } +} + +impl<'a> From<&'a dyn Column> for EstimateColumn<'a> { + fn from(column: &'a dyn Column) -> Self { + let limit_num_vals = column.num_vals().min(100_000); + Self { + column, + num_vals: limit_num_vals, + } + } +} + struct MonotonicMappingColumn { from_column: C, monotonic_mapping: T, diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 4205a323a5..96b2e2119c 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -11,6 +11,7 @@ use std::io; use std::io::Write; use std::sync::Arc; +use column::EstimateColumn; use common::BinarySerializable; use compact_space::CompactSpaceDecompressor; use ownedbytes::OwnedBytes; @@ -132,7 +133,7 @@ trait FastFieldCodec: 'static { /// /// It could make sense to also return a value representing /// computational complexity. - fn estimate(column: &dyn Column) -> Option; + fn estimate(column: &EstimateColumn) -> Option; } pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [ @@ -149,6 +150,7 @@ mod tests { use crate::bitpacked::BitpackedCodec; use crate::blockwise_linear::BlockwiseLinearCodec; + use crate::column::EstimateColumn; use crate::linear::LinearCodec; use crate::serialize::Header; @@ -159,7 +161,9 @@ mod tests { let col = &VecColumn::from(data); let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?; let normalized_col = header.normalize_column(col); - let estimation = Codec::estimate(&normalized_col)?; + + let limited_column = EstimateColumn::new(&normalized_col); + let estimation = Codec::estimate(&limited_column)?; let mut out = Vec::new(); let col = VecColumn::from(data); @@ -280,14 +284,16 @@ mod tests { let data = (10..=20000_u64).collect::>(); let data: VecColumn = data.as_slice().into(); - let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); + let linear_interpol_estimation = + LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_le!(linear_interpol_estimation, 0.01); - let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap(); + let multi_linear_interpol_estimation = + BlockwiseLinearCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_le!(multi_linear_interpol_estimation, 0.2); assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation); - let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); + let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_lt!(linear_interpol_estimation, bitpacked_estimation); } #[test] @@ -295,18 +301,20 @@ mod tests { let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20]; let data: VecColumn = data.into(); - let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); + let linear_interpol_estimation = + LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_le!(linear_interpol_estimation, 0.34); - let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); + let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_lt!(bitpacked_estimation, linear_interpol_estimation); } #[test] fn estimation_prefer_bitpacked() { let data = VecColumn::from(&[10, 10, 10, 10]); - let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); - let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); + let linear_interpol_estimation = + LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap(); + let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_lt!(bitpacked_estimation, linear_interpol_estimation); } @@ -318,10 +326,11 @@ mod tests { // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); + let linear_interpol_estimation = + LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_le!(linear_interpol_estimation, 0.35); - let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); + let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap(); assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, linear_interpol_estimation); } diff --git a/fastfield_codecs/src/line.rs b/fastfield_codecs/src/line.rs index c1eb558e57..e26abf9a68 100644 --- a/fastfield_codecs/src/line.rs +++ b/fastfield_codecs/src/line.rs @@ -67,19 +67,11 @@ impl Line { self.intercept.wrapping_add(linear_part) } - // Same as train, but the intercept is only estimated from provided sample positions - pub fn estimate(ys: &dyn Column, sample_positions: &[u64]) -> Self { - Self::train_from( - ys, - sample_positions - .iter() - .cloned() - .map(|pos| (pos, ys.get_val(pos))), - ) - } - // Intercept is only computed from provided positions - fn train_from(ys: &dyn Column, positions_and_values: impl Iterator) -> Self { + pub fn train_from( + ys: &dyn Column, + positions_and_values: impl Iterator, + ) -> Self { let num_vals = if let Some(num_vals) = NonZeroU64::new(ys.num_vals() - 1) { num_vals } else { diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index ad6f923a1a..ec6bd1d3e7 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -4,6 +4,7 @@ use common::BinarySerializable; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; +use crate::column::EstimateColumn; use crate::line::Line; use crate::serialize::NormalizedHeader; use crate::{Column, FastFieldCodec, FastFieldCodecType}; @@ -121,23 +122,23 @@ impl FastFieldCodec for LinearCodec { /// where the local maxima for the deviation of the calculated value are and /// the offset to shift all values to >=0 is also unknown. #[allow(clippy::question_mark)] - fn estimate(column: &dyn Column) -> Option { + fn estimate(column: &EstimateColumn) -> Option { if column.num_vals() < 3 { return None; // disable compressor for this case } // let's sample at 0%, 5%, 10% .. 95%, 100% let num_vals = column.num_vals() as f32 / 100.0; - let sample_positions = (0..20) + let sample_positions_and_values = (0..20) .map(|pos| (num_vals * pos as f32 * 5.0) as u64) + .map(|pos| (pos, column.get_val(pos))) .collect::>(); - let line = Line::estimate(column, &sample_positions); + let line = { Line::train_from(column, sample_positions_and_values.iter().cloned()) }; - let estimated_bit_width = sample_positions + let estimated_bit_width = sample_positions_and_values .into_iter() - .map(|pos| { - let actual_value = column.get_val(pos); + .map(|(pos, actual_value)| { let interpolated_val = line.eval(pos as u64); actual_value.wrapping_sub(interpolated_val) }) diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 92f55f5d0f..46c6d188b8 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -28,6 +28,7 @@ use ownedbytes::OwnedBytes; use crate::bitpacked::BitpackedCodec; use crate::blockwise_linear::BlockwiseLinearCodec; +use crate::column::EstimateColumn; use crate::compact_space::CompactSpaceCompressor; use crate::linear::LinearCodec; use crate::{ @@ -125,23 +126,6 @@ impl BinarySerializable for Header { } } -pub fn estimate( - typed_column: impl Column, - codec_type: FastFieldCodecType, -) -> Option { - let column = monotonic_map_column(typed_column, T::to_u64); - let min_value = column.min_value(); - let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) - .filter(|gcd| gcd.get() > 1u64); - let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); - let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value)); - match codec_type { - FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column), - FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column), - FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column), - } -} - pub fn serialize_u128( typed_column: impl Column, output: &mut impl io::Write, @@ -177,10 +161,29 @@ pub fn serialize( Ok(()) } +pub fn estimate( + typed_column: impl Column, + codec_type: FastFieldCodecType, +) -> Option { + let column = monotonic_map_column(typed_column, T::to_u64); + let min_value = column.min_value(); + let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) + .filter(|gcd| gcd.get() > 1u64); + let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); + let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value)); + let estimate_column = EstimateColumn::new(&normalized_column); + match codec_type { + FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&estimate_column), + FastFieldCodecType::Linear => LinearCodec::estimate(&estimate_column), + FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&estimate_column), + } +} + fn detect_codec( column: impl Column, codecs: &[FastFieldCodecType], ) -> Option { + let column: EstimateColumn = EstimateColumn::new(&column); let mut estimations = Vec::new(); for &codec in codecs { let estimation_opt = match codec {