Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reuse samples, add EstimateColumn #1551

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion fastfield_codecs/src/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::io::{self, Write};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::column::EstimateColumn;
use crate::serialize::NormalizedHeader;
use crate::{Column, FastFieldCodec, FastFieldCodecType};

Expand Down Expand Up @@ -75,7 +76,7 @@ impl FastFieldCodec for BitpackedCodec {
Ok(())
}

fn estimate(column: &dyn Column) -> Option<f32> {
fn estimate(column: &EstimateColumn) -> Option<f32> {
let num_bits = compute_num_bits(column.max_value());
let num_bits_uncompressed = 64;
Some(num_bits as f32 / num_bits_uncompressed as f32)
Expand Down
3 changes: 2 additions & 1 deletion fastfield_codecs/src/blockwise_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use common::{BinarySerializable, CountingWriter, DeserializeFrom};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::column::EstimateColumn;
use crate::line::Line;
use crate::serialize::NormalizedHeader;
use crate::{Column, FastFieldCodec, FastFieldCodecType, VecColumn};
Expand Down Expand Up @@ -71,7 +72,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
}

// Estimate first_chunk and extrapolate
fn estimate(column: &dyn crate::Column) -> Option<f32> {
fn estimate(column: &EstimateColumn) -> Option<f32> {
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
return None;
}
Expand Down
51 changes: 51 additions & 0 deletions fastfield_codecs/src/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,57 @@ where V: AsRef<[T]> + ?Sized
}
}

// Creates a view over a Column with a limited number of vals. Stats like min max are unchanged
pub struct EstimateColumn<'a> {
column: &'a dyn Column,
num_vals: u64,
}
impl<'a> EstimateColumn<'a> {
pub(crate) fn new(column: &'a dyn Column) -> Self {
let limit_num_vals = column.num_vals().min(100_000);
Self {
column,
num_vals: limit_num_vals,
}
}
}

impl<'a> Column for EstimateColumn<'a> {
fn get_val(&self, idx: u64) -> u64 {
(*self.column).get_val(idx)
}

fn min_value(&self) -> u64 {
(*self.column).min_value()
}

fn max_value(&self) -> u64 {
(*self.column).max_value()
}

fn num_vals(&self) -> u64 {
self.num_vals
}

fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((*self.column).iter().take(self.num_vals as usize))
}

fn get_range(&self, start: u64, output: &mut [u64]) {
(*self.column).get_range(start, output)
}
}

impl<'a> From<&'a dyn Column> for EstimateColumn<'a> {
fn from(column: &'a dyn Column) -> Self {
let limit_num_vals = column.num_vals().min(100_000);
Self {
column,
num_vals: limit_num_vals,
}
}
}

struct MonotonicMappingColumn<C, T, Input> {
from_column: C,
monotonic_mapping: T,
Expand Down
31 changes: 20 additions & 11 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use std::io;
use std::io::Write;
use std::sync::Arc;

use column::EstimateColumn;
use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
use ownedbytes::OwnedBytes;
Expand Down Expand Up @@ -132,7 +133,7 @@ trait FastFieldCodec: 'static {
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(column: &dyn Column) -> Option<f32>;
fn estimate(column: &EstimateColumn) -> Option<f32>;
}

pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
Expand All @@ -149,6 +150,7 @@ mod tests {

use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::column::EstimateColumn;
use crate::linear::LinearCodec;
use crate::serialize::Header;

Expand All @@ -159,7 +161,9 @@ mod tests {
let col = &VecColumn::from(data);
let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
let normalized_col = header.normalize_column(col);
let estimation = Codec::estimate(&normalized_col)?;

let limited_column = EstimateColumn::new(&normalized_col);
let estimation = Codec::estimate(&limited_column)?;

let mut out = Vec::new();
let col = VecColumn::from(data);
Expand Down Expand Up @@ -280,33 +284,37 @@ mod tests {
let data = (10..=20000_u64).collect::<Vec<_>>();
let data: VecColumn = data.as_slice().into();

let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_le!(linear_interpol_estimation, 0.01);

let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
let multi_linear_interpol_estimation =
BlockwiseLinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);

let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_lt!(linear_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

let data: VecColumn = data.into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_le!(linear_interpol_estimation, 0.34);

let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_lt!(bitpacked_estimation, linear_interpol_estimation);
}

#[test]
fn estimation_prefer_bitpacked() {
let data = VecColumn::from(&[10, 10, 10, 10]);
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_lt!(bitpacked_estimation, linear_interpol_estimation);
}

Expand All @@ -318,10 +326,11 @@ mod tests {

// in this case the linear interpolation can't in fact not be worse than bitpacking,
// but the estimator adds some threshold, which leads to estimated worse behavior
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_le!(linear_interpol_estimation, 0.35);

let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
Expand Down
16 changes: 4 additions & 12 deletions fastfield_codecs/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,11 @@ impl Line {
self.intercept.wrapping_add(linear_part)
}

// Same as train, but the intercept is only estimated from provided sample positions
pub fn estimate(ys: &dyn Column, sample_positions: &[u64]) -> Self {
Self::train_from(
ys,
sample_positions
.iter()
.cloned()
.map(|pos| (pos, ys.get_val(pos))),
)
}

// Intercept is only computed from provided positions
fn train_from(ys: &dyn Column, positions_and_values: impl Iterator<Item = (u64, u64)>) -> Self {
pub fn train_from(
ys: &dyn Column,
positions_and_values: impl Iterator<Item = (u64, u64)>,
) -> Self {
let num_vals = if let Some(num_vals) = NonZeroU64::new(ys.num_vals() - 1) {
num_vals
} else {
Expand Down
13 changes: 7 additions & 6 deletions fastfield_codecs/src/linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::column::EstimateColumn;
use crate::line::Line;
use crate::serialize::NormalizedHeader;
use crate::{Column, FastFieldCodec, FastFieldCodecType};
Expand Down Expand Up @@ -121,23 +122,23 @@ impl FastFieldCodec for LinearCodec {
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
#[allow(clippy::question_mark)]
fn estimate(column: &dyn Column) -> Option<f32> {
fn estimate(column: &EstimateColumn) -> Option<f32> {
if column.num_vals() < 3 {
return None; // disable compressor for this case
}

// let's sample at 0%, 5%, 10% .. 95%, 100%
let num_vals = column.num_vals() as f32 / 100.0;
let sample_positions = (0..20)
let sample_positions_and_values = (0..20)
.map(|pos| (num_vals * pos as f32 * 5.0) as u64)
.map(|pos| (pos, column.get_val(pos)))
.collect::<Vec<_>>();

let line = Line::estimate(column, &sample_positions);
let line = { Line::train_from(column, sample_positions_and_values.iter().cloned()) };

let estimated_bit_width = sample_positions
let estimated_bit_width = sample_positions_and_values
.into_iter()
.map(|pos| {
let actual_value = column.get_val(pos);
.map(|(pos, actual_value)| {
let interpolated_val = line.eval(pos as u64);
actual_value.wrapping_sub(interpolated_val)
})
Expand Down
37 changes: 20 additions & 17 deletions fastfield_codecs/src/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use ownedbytes::OwnedBytes;

use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::column::EstimateColumn;
use crate::compact_space::CompactSpaceCompressor;
use crate::linear::LinearCodec;
use crate::{
Expand Down Expand Up @@ -125,23 +126,6 @@ impl BinarySerializable for Header {
}
}

pub fn estimate<T: MonotonicallyMappableToU64>(
typed_column: impl Column<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
let column = monotonic_map_column(typed_column, T::to_u64);
let min_value = column.min_value();
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
.filter(|gcd| gcd.get() > 1u64);
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
match codec_type {
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
}
}

pub fn serialize_u128(
typed_column: impl Column<u128>,
output: &mut impl io::Write,
Expand Down Expand Up @@ -177,10 +161,29 @@ pub fn serialize<T: MonotonicallyMappableToU64>(
Ok(())
}

pub fn estimate<T: MonotonicallyMappableToU64>(
typed_column: impl Column<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
let column = monotonic_map_column(typed_column, T::to_u64);
let min_value = column.min_value();
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
.filter(|gcd| gcd.get() > 1u64);
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
let estimate_column = EstimateColumn::new(&normalized_column);
match codec_type {
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&estimate_column),
FastFieldCodecType::Linear => LinearCodec::estimate(&estimate_column),
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&estimate_column),
}
}

fn detect_codec(
column: impl Column<u64>,
codecs: &[FastFieldCodecType],
) -> Option<FastFieldCodecType> {
let column: EstimateColumn = EstimateColumn::new(&column);
let mut estimations = Vec::new();
for &codec in codecs {
let estimation_opt = match codec {
Expand Down