Added comments

quickwit-oss · Aug 26, 2022 · c7a8248 · c7a8248
1 parent d3744cd
commit c7a8248
Showing 1 changed file with 90 additions and 44 deletions.
diff --git a/fastfield_codecs/src/line.rs b/fastfield_codecs/src/line.rs
@@ -1,67 +1,105 @@
-use std::io;
+use std::{io, num::NonZeroU64};
 
 use common::{BinarySerializable, VInt};
 
 use crate::FastFieldDataAccess;
 
 const MID_POINT: u64 = (1u64 << 32) - 1u64;
 
-#[derive(Debug, Clone, Copy)]
+/// `Line` describes a line function `y: ax + b` using integer
+/// arithmetics.
+///
+/// The slope is in fact a decimal split into a 32 bit integer value,
+/// and a 32-bit decimal value.
+///
+/// The multiplication then becomes.
+/// `y = m * x >> 32 + b`
+#[derive(Debug, Clone, Copy, Default)]
 pub struct Line {
     slope: u64,
     intercept: u64,
 }
 
+/// Compute the line slope.
+fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU64) -> u64 {
+    let dy = y1.wrapping_sub(y0);
+    let sign = dy < (1 << 32);
+    let abs_dy = if sign {
+        y1.wrapping_sub(y0)
+    } else {
+        y0.wrapping_sub(y1)
+    };
+
+    let abs_slope = (abs_dy << 32) / num_vals.get();
+    if sign {
+        abs_slope
+    } else {
+        u64::MAX - abs_slope
+    }
+}
+
 impl Line {
     #[inline(always)]
     pub fn eval(&self, x: u64) -> u64 {
-        let res = x.wrapping_mul(self.slope).wrapping_shr(32) as i32 as u64;
-        res.wrapping_add(self.intercept)
+        let linear_part = (x.wrapping_mul(self.slope) >> 32) as i32 as u64;
+        self.intercept.wrapping_add(linear_part)
     }
 
-    /// Returns a function such that f(i)
-    /// - computes without panicking for any i..
-    /// - `f(i).wrapping_sub(ys[i])` is small for i in [0..ys.len())
+    /// Returns a line that attemps to approximate a function
+    /// f: i in 0..[ys.num_vals()) -> ys[i].
+    ///
+    /// - The approximation is always lower than the actual value.
+    /// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
+    /// for any i in [0..ys.len()).
+    /// - It computes without panicking for any value of it.
     pub fn train(ys: &dyn FastFieldDataAccess) -> Self {
-        let num_vals = ys.num_vals();
-        assert!(num_vals > 0);
+        let num_vals =
+            if let Some(num_vals) = NonZeroU64::new(ys.num_vals()) {
+                num_vals
+            } else {
+                return Line::default();
+            };
 
         let y0 = ys.get_val(0);
-        let y1 = ys.get_val(num_vals - 1);
-
-        let dy = y1.wrapping_sub(y0);
-        let sign = dy < (1 << 32);
-        let abs_dy = if sign {
-            y1.wrapping_sub(y0)
-        } else {
-            y0.wrapping_sub(y1)
-        };
-
-        let abs_slope = (abs_dy << 32) / (num_vals as u64);
-
-        let slope = if sign {
-            abs_slope
-        } else {
-            u64::MAX - abs_slope
-        };
-
+        let y1 = ys.get_val(num_vals.get() - 1);
+
+        // We first independently pick our slope.
+        let slope = compute_slope(y0, y1, num_vals);
+
+        // We picked our slope. Note that it does not have to be perfect.
+        // Now we need to compute the best intercept.
+        //
+        // Intuitively, the best intercept is such that line passes through one of the
+        // `(i, ys[])`.
+        //
+        // The best intercept therefore has the form
+        // `y[i] - line.eval(i)` (using wrapping arithmetics).
+        // In other words, the best intercept is one of the `y - Line::eval(ys[i])`
+        // and our task is just to pick the one that minimizes our error.
+        //
+        // Without sorting our values, this is a difficult problem.
+        // We however rely on the following trick...
+        //
+        // We only focus on the case where the interpolation is half decent.
+        // If the line interpolation is doing its job on a dataset suited for it,
+        // we can hope that the maximum error won't be larger than `u64::MAX / 2`.
+        //
+        // In other words, even without the intercept the values `y - Line::eval(ys[i])` will all be within
+        // an interval that takes less than half of the modulo space of `u64`.
+        //
+        // Our task is therefore to identify this interval.
+        // Here we simply translate all of our values by `y0 - 2^63` and pick the min.
         let mut line = Line {
             slope,
             intercept: 0,
         };
-
         let heuristic_shift = y0.wrapping_sub(MID_POINT);
-
         line.intercept = ys
             .iter()
             .enumerate()
             .map(|(i, y)| y.wrapping_sub(line.eval(i as u64)))
-            // The shift here is a way to deal with the case where the current line is going below
-            // the vals. This is an heuristic that works perfectly if our values are all
-            // located within an interval of amplitude lesser than 2^63.
             .min_by_key(|&val| val.wrapping_sub(heuristic_shift))
-            .unwrap();
-
+            .unwrap_or(0u64); //< Never happens.
         line
     }
 }
@@ -84,14 +122,22 @@ impl BinarySerializable for Line {
 mod tests {
     use super::*;
 
-    fn test_eval_max_err(ys: &[u64], expected: Option<u64>) {
-        for &shift in &[0, 100, 1 << 63, u64::MAX, u64::MAX - 10, u64::MAX - 12] {
-            assert_eq!(test_eval_max_err_with_shift(ys, shift), expected);
+    /// Test training a line and ensuring that the maximum difference between
+    /// the data points and the line is `expected`.
+    ///
+    /// This function operates translation over the data for better coverage.
+    fn test_line_interpol_with_translation(ys: &[u64], expected: Option<u64>) {
+        let mut translations =  vec![0, 100, u64::MAX, u64::MAX - 1];
+        translations.extend_from_slice(ys);
+        for translation in translations {
+            let translated_ys: Vec<u64> = ys.iter().copied().map(|y| y.wrapping_add(translation)).collect();
+            let largest_err = test_eval_max_err(&translated_ys);
+            assert_eq!(largest_err, expected);
+
         }
     }
 
-    fn test_eval_max_err_with_shift(ys: &[u64], shift: u64) -> Option<u64> {
-        let ys: Vec<u64> = ys.iter().copied().map(|y| y.wrapping_add(shift)).collect();
+    fn test_eval_max_err(ys: &[u64]) -> Option<u64> {
         let line = Line::train(&&ys[..]);
         ys.iter()
             .enumerate()
@@ -101,10 +147,10 @@ mod tests {
 
     #[test]
     fn test_train() {
-        test_eval_max_err(&[11, 11, 11, 12, 12, 13], Some(1));
-        test_eval_max_err(&[13, 12, 12, 11, 11, 11], Some(0));
-        test_eval_max_err(&[13, 13, 12, 11, 11, 11], Some(1));
-        test_eval_max_err(&[13, 13, 12, 11, 11, 11], Some(1));
-        test_eval_max_err(&[u64::MAX - 1, 0, 0, 1], Some(2));
+        test_line_interpol_with_translation(&[11, 11, 11, 12, 12, 13], Some(1));
+        test_line_interpol_with_translation(&[13, 12, 12, 11, 11, 11], Some(0));
+        test_line_interpol_with_translation(&[13, 13, 12, 11, 11, 11], Some(1));
+        test_line_interpol_with_translation(&[13, 13, 12, 11, 11, 11], Some(1));
+        test_line_interpol_with_translation(&[u64::MAX - 1, 0, 0, 1], Some(2));
     }
 }