From ba3215b469c32aafef4ecd91ed3953364b0f84c6 Mon Sep 17 00:00:00 2001
From: Pascal Seitz <pascal.seitz@gmail.com>
Date: Sun, 25 Sep 2022 22:29:42 +0800
Subject: [PATCH] reuse samples, add EstimateColumn

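Estimations can be expensive since the samples span the whole column
and, depending on the implementation, get_val cannot be computed
cheaply without an index.
EstimateColumn adds a view over the column which limits num_vals
to 100_000.
The linear codec now fetches each sampled value once and reuses it for
both training the line and estimating the bit width.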
---
 fastfield_codecs/src/bitpacked.rs        |  3 +-
 fastfield_codecs/src/blockwise_linear.rs |  3 +-
 fastfield_codecs/src/column.rs           | 51 ++++++++++++++++++++++++
 fastfield_codecs/src/lib.rs              | 31 +++++++++-----
 fastfield_codecs/src/line.rs             | 16 ++------
 fastfield_codecs/src/linear.rs           | 13 +++---
 fastfield_codecs/src/serialize.rs        | 37 +++++++++--------
 7 files changed, 106 insertions(+), 48 deletions(-)
diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs
index f91993a66e..de038787b0 100644
--- a/fastfield_codecs/src/bitpacked.rs
+++ b/fastfield_codecs/src/bitpacked.rs
@@ -3,6 +3,7 @@ use std::io::{self, Write};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
 
+use crate::column::EstimateColumn;
 use crate::serialize::NormalizedHeader;
 use crate::{Column, FastFieldCodec, FastFieldCodecType};
 
@@ -75,7 +76,7 @@ impl FastFieldCodec for BitpackedCodec {
         Ok(())
     }
 
-    fn estimate(column: &dyn Column) -> Option<f32> {
+    fn estimate(column: &EstimateColumn) -> Option<f32> {
         let num_bits = compute_num_bits(column.max_value());
         let num_bits_uncompressed = 64;
         Some(num_bits as f32 / num_bits_uncompressed as f32)
diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs
index 360f5c3ade..a638805a97 100644
--- a/fastfield_codecs/src/blockwise_linear.rs
+++ b/fastfield_codecs/src/blockwise_linear.rs
@@ -5,6 +5,7 @@ use common::{BinarySerializable, CountingWriter, DeserializeFrom};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
 
+use crate::column::EstimateColumn;
 use crate::line::Line;
 use crate::serialize::NormalizedHeader;
 use crate::{Column, FastFieldCodec, FastFieldCodecType, VecColumn};
@@ -71,7 +72,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
     }
 
     // Estimate first_chunk and extrapolate
-    fn estimate(column: &dyn crate::Column) -> Option<f32> {
+    fn estimate(column: &EstimateColumn) -> Option<f32> {
         if column.num_vals() < 10 * CHUNK_SIZE as u64 {
             return None;
         }
diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs
index b40f76b692..0a9e179028 100644
--- a/fastfield_codecs/src/column.rs
+++ b/fastfield_codecs/src/column.rs
@@ -137,6 +137,57 @@ where V: AsRef<[T]> + ?Sized
     }
 }
 
+// View over a Column that caps num_vals at 100_000. Stats like min and max are left unchanged.
+pub struct EstimateColumn<'a> {
+    column: &'a dyn Column,
+    num_vals: u64,
+}
+impl<'a> EstimateColumn<'a> {
+    pub(crate) fn new(column: &'a dyn Column) -> Self {
+        let limit_num_vals = column.num_vals().min(100_000);
+        Self {
+            column,
+            num_vals: limit_num_vals,
+        }
+    }
+}
+
+impl<'a> Column for EstimateColumn<'a> {
+    fn get_val(&self, idx: u64) -> u64 {
+        (*self.column).get_val(idx)
+    }
+
+    fn min_value(&self) -> u64 {
+        (*self.column).min_value()
+    }
+
+    fn max_value(&self) -> u64 {
+        (*self.column).max_value()
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.num_vals
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new((*self.column).iter().take(self.num_vals as usize))
+    }
+
+    fn get_range(&self, start: u64, output: &mut [u64]) {
+        (*self.column).get_range(start, output)
+    }
+}
+
+impl<'a> From<&'a dyn Column> for EstimateColumn<'a> {
+    fn from(column: &'a dyn Column) -> Self {
+        let limit_num_vals = column.num_vals().min(100_000);
+        Self {
+            column,
+            num_vals: limit_num_vals,
+        }
+    }
+}
+
 struct MonotonicMappingColumn<C, T, Input> {
     from_column: C,
     monotonic_mapping: T,
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index 4205a323a5..96b2e2119c 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -11,6 +11,7 @@ use std::io;
 use std::io::Write;
 use std::sync::Arc;
 
+use column::EstimateColumn;
 use common::BinarySerializable;
 use compact_space::CompactSpaceDecompressor;
 use ownedbytes::OwnedBytes;
@@ -132,7 +133,7 @@ trait FastFieldCodec: 'static {
     ///
     /// It could make sense to also return a value representing
     /// computational complexity.
-    fn estimate(column: &dyn Column) -> Option<f32>;
+    fn estimate(column: &EstimateColumn) -> Option<f32>;
 }
 
 pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
@@ -149,6 +150,7 @@ mod tests {
 
     use crate::bitpacked::BitpackedCodec;
     use crate::blockwise_linear::BlockwiseLinearCodec;
+    use crate::column::EstimateColumn;
     use crate::linear::LinearCodec;
     use crate::serialize::Header;
 
@@ -159,7 +161,9 @@ mod tests {
         let col = &VecColumn::from(data);
         let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
         let normalized_col = header.normalize_column(col);
-        let estimation = Codec::estimate(&normalized_col)?;
+
+        let limited_column = EstimateColumn::new(&normalized_col);
+        let estimation = Codec::estimate(&limited_column)?;
 
         let mut out = Vec::new();
         let col = VecColumn::from(data);
@@ -280,14 +284,16 @@ mod tests {
         let data = (10..=20000_u64).collect::<Vec<_>>();
         let data: VecColumn = data.as_slice().into();
 
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);
 
-        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
+        let multi_linear_interpol_estimation =
+            BlockwiseLinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(multi_linear_interpol_estimation, 0.2);
         assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
 
-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_lt!(linear_interpol_estimation, bitpacked_estimation);
     }
     #[test]
@@ -295,18 +301,20 @@ mod tests {
         let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
 
         let data: VecColumn = data.into();
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(linear_interpol_estimation, 0.34);
 
-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_lt!(bitpacked_estimation, linear_interpol_estimation);
     }
 
     #[test]
     fn estimation_prefer_bitpacked() {
         let data = VecColumn::from(&[10, 10, 10, 10]);
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_lt!(bitpacked_estimation, linear_interpol_estimation);
     }
 
@@ -318,10 +326,11 @@ mod tests {
 
         // in this case the linear interpolation can't in fact not be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(linear_interpol_estimation, 0.35);
 
-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(bitpacked_estimation, 0.32);
         assert_le!(bitpacked_estimation, linear_interpol_estimation);
     }
diff --git a/fastfield_codecs/src/line.rs b/fastfield_codecs/src/line.rs
index c1eb558e57..e26abf9a68 100644
--- a/fastfield_codecs/src/line.rs
+++ b/fastfield_codecs/src/line.rs
@@ -67,19 +67,11 @@ impl Line {
         self.intercept.wrapping_add(linear_part)
     }
 
-    // Same as train, but the intercept is only estimated from provided sample positions
-    pub fn estimate(ys: &dyn Column, sample_positions: &[u64]) -> Self {
-        Self::train_from(
-            ys,
-            sample_positions
-                .iter()
-                .cloned()
-                .map(|pos| (pos, ys.get_val(pos))),
-        )
-    }
-
     // Intercept is only computed from provided positions
-    fn train_from(ys: &dyn Column, positions_and_values: impl Iterator<Item = (u64, u64)>) -> Self {
+    pub fn train_from(
+        ys: &dyn Column,
+        positions_and_values: impl Iterator<Item = (u64, u64)>,
+    ) -> Self {
         let num_vals = if let Some(num_vals) = NonZeroU64::new(ys.num_vals() - 1) {
             num_vals
         } else {
diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs
index ad6f923a1a..ec6bd1d3e7 100644
--- a/fastfield_codecs/src/linear.rs
+++ b/fastfield_codecs/src/linear.rs
@@ -4,6 +4,7 @@ use common::BinarySerializable;
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
 
+use crate::column::EstimateColumn;
 use crate::line::Line;
 use crate::serialize::NormalizedHeader;
 use crate::{Column, FastFieldCodec, FastFieldCodecType};
@@ -121,23 +122,23 @@ impl FastFieldCodec for LinearCodec {
     /// where the local maxima for the deviation of the calculated value are and
     /// the offset to shift all values to >=0 is also unknown.
     #[allow(clippy::question_mark)]
-    fn estimate(column: &dyn Column) -> Option<f32> {
+    fn estimate(column: &EstimateColumn) -> Option<f32> {
         if column.num_vals() < 3 {
             return None; // disable compressor for this case
         }
 
         // let's sample at 0%, 5%, 10% .. 95%, 100%
         let num_vals = column.num_vals() as f32 / 100.0;
-        let sample_positions = (0..20)
+        let sample_positions_and_values = (0..20)
             .map(|pos| (num_vals * pos as f32 * 5.0) as u64)
+            .map(|pos| (pos, column.get_val(pos)))
             .collect::<Vec<_>>();
 
-        let line = Line::estimate(column, &sample_positions);
+        let line = { Line::train_from(column, sample_positions_and_values.iter().cloned()) };
 
-        let estimated_bit_width = sample_positions
+        let estimated_bit_width = sample_positions_and_values
             .into_iter()
-            .map(|pos| {
-                let actual_value = column.get_val(pos);
+            .map(|(pos, actual_value)| {
                 let interpolated_val = line.eval(pos as u64);
                 actual_value.wrapping_sub(interpolated_val)
             })
diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs
index 92f55f5d0f..46c6d188b8 100644
--- a/fastfield_codecs/src/serialize.rs
+++ b/fastfield_codecs/src/serialize.rs
@@ -28,6 +28,7 @@ use ownedbytes::OwnedBytes;
 
 use crate::bitpacked::BitpackedCodec;
 use crate::blockwise_linear::BlockwiseLinearCodec;
+use crate::column::EstimateColumn;
 use crate::compact_space::CompactSpaceCompressor;
 use crate::linear::LinearCodec;
 use crate::{
@@ -125,23 +126,6 @@ impl BinarySerializable for Header {
     }
 }
 
-pub fn estimate<T: MonotonicallyMappableToU64>(
-    typed_column: impl Column<T>,
-    codec_type: FastFieldCodecType,
-) -> Option<f32> {
-    let column = monotonic_map_column(typed_column, T::to_u64);
-    let min_value = column.min_value();
-    let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
-        .filter(|gcd| gcd.get() > 1u64);
-    let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
-    let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
-    match codec_type {
-        FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
-        FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
-        FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
-    }
-}
-
 pub fn serialize_u128(
     typed_column: impl Column<u128>,
     output: &mut impl io::Write,
@@ -177,10 +161,29 @@ pub fn serialize<T: MonotonicallyMappableToU64>(
     Ok(())
 }
 
+pub fn estimate<T: MonotonicallyMappableToU64>(
+    typed_column: impl Column<T>,
+    codec_type: FastFieldCodecType,
+) -> Option<f32> {
+    let column = monotonic_map_column(typed_column, T::to_u64);
+    let min_value = column.min_value();
+    let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
+        .filter(|gcd| gcd.get() > 1u64);
+    let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
+    let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
+    let estimate_column = EstimateColumn::new(&normalized_column);
+    match codec_type {
+        FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&estimate_column),
+        FastFieldCodecType::Linear => LinearCodec::estimate(&estimate_column),
+        FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&estimate_column),
+    }
+}
+
 fn detect_codec(
     column: impl Column<u64>,
     codecs: &[FastFieldCodecType],
 ) -> Option<FastFieldCodecType> {
+    let column: EstimateColumn = EstimateColumn::new(&column);
     let mut estimations = Vec::new();
     for &codec in codecs {
         let estimation_opt = match codec {