Introduce comments to store.rs, fuzz tests

This commit adds the last bit of polish to store.rs in terms of commenting on its function. We also introduce fuzz tests for inspecting the CKMS implementation. -max_len=160 ought to be in place for the existing fuzz test. Signed-off-by: Brian L. Troutwine <[email protected]>
postmates · Dec 11, 2017 · e027030 · e027030
1 parent 2193949
commit e027030
Show file tree

Hide file tree

Showing 5 changed files with 178 additions and 35 deletions.
diff --git a/benches/ckms.rs b/benches/ckms.rs
@@ -4,28 +4,27 @@ extern crate test;
 extern crate quantiles;
 
 mod ckms {
-
-#[derive(Debug, Clone, Copy)]
-pub struct Xorshift {
-    seed: u64,
-}
-
-impl Xorshift {
-    pub fn new(seed: u64) -> Xorshift {
-        Xorshift { seed: seed }
+    #[derive(Debug, Clone, Copy)]
+    pub struct Xorshift {
+        seed: u64,
     }
-
-    pub fn next_val(&mut self) -> u32 {
-        // implementation inspired by
-        // https://github.com/astocko/xorshift/blob/master/src/splitmix64.rs
-        use std::num::Wrapping as w;
-
-        let mut z = w(self.seed) + w(0x9E37_79B9_7F4A_7C15_u64);
-        let nxt_seed = z.0;
-        z = (z ^ (z >> 30)) * w(0xBF58_476D_1CE4_E5B9_u64);
-        z = (z ^ (z >> 27)) * w(0x94D0_49BB_1331_11EB_u64);
-        self.seed = nxt_seed;
-        u32::from((z ^ (z >> 31)).0 as u16)
+
+    impl Xorshift {
+        pub fn new(seed: u64) -> Xorshift {
+            Xorshift { seed: seed }
+        }
+
+        pub fn next_val(&mut self) -> u32 {
+            // implementation inspired by
+            // https://github.com/astocko/xorshift/blob/master/src/splitmix64.rs
+            use std::num::Wrapping as w;
+
+            let mut z = w(self.seed) + w(0x9E37_79B9_7F4A_7C15_u64);
+            let nxt_seed = z.0;
+            z = (z ^ (z >> 30)) * w(0xBF58_476D_1CE4_E5B9_u64);
+            z = (z ^ (z >> 27)) * w(0x94D0_49BB_1331_11EB_u64);
+            self.seed = nxt_seed;
+            u32::from((z ^ (z >> 31)).0 as u16)
         }
     }
 

diff --git a/fuzz/.gitignore b/fuzz/.gitignore
@@ -0,0 +1,4 @@
+
+target
+corpus
+artifacts
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "quantiles-fuzz"
+version = "0.0.1"
+authors = ["Automatically generated"]
+publish = false
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+byteorder = "1.2"
+
+[dependencies.quantiles]
+path = ".."
+[dependencies.libfuzzer-sys]
+git = "https://github.com/rust-fuzz/libfuzzer-sys.git"
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "ckms_u32"
+path = "fuzz_targets/ckms_u32.rs"
diff --git a/fuzz/fuzz_targets/ckms_u32.rs b/fuzz/fuzz_targets/ckms_u32.rs
@@ -0,0 +1,61 @@
+#![no_main]
+#[macro_use] extern crate libfuzzer_sys;
+extern crate quantiles;
+extern crate byteorder;
+
+use std::io::Cursor;
+use byteorder::{BigEndian, ReadBytesExt};
+
+#[derive(Debug, Clone, Copy)]
+pub struct Xorshift {
+    seed: u64,
+}
+
+impl Xorshift {
+    pub fn new(seed: u64) -> Xorshift {
+        Xorshift { seed: seed }
+    }
+
+    pub fn next_val(&mut self) -> u32 {
+        // implementation inspired by
+        // https://github.com/astocko/xorshift/blob/master/src/splitmix64.rs
+        use std::num::Wrapping as w;
+
+        let mut z = w(self.seed) + w(0x9E37_79B9_7F4A_7C15_u64);
+        let nxt_seed = z.0;
+        z = (z ^ (z >> 30)) * w(0xBF58_476D_1CE4_E5B9_u64);
+        z = (z ^ (z >> 27)) * w(0x94D0_49BB_1331_11EB_u64);
+        self.seed = nxt_seed;
+        u32::from((z ^ (z >> 31)).0 as u16)
+    }
+}
+
+fuzz_target!(|data: &[u8]| {
+    let mut cursor = Cursor::new(data);
+
+    // unbounded, CKMS will adjust to within bounds 
+    let error: f64 = if let Ok(res) = cursor.read_f64::<BigEndian>() {
+        res
+    } else {
+        return;
+    };
+    // bounded 2**24
+    let upper_bound: u32 = if let Ok(res) = cursor.read_u32::<BigEndian>() {
+        res % 16_777_216
+    } else {
+        return;
+    };
+    // unbounded
+    let seed: u64 = if let Ok(res) = cursor.read_u64::<BigEndian>() {
+        res
+    } else {
+        return;
+    };
+
+    let mut xshft = Xorshift::new(seed);
+    let mut ckms = quantiles::ckms::CKMS::<u32>::new(error);
+    for _ in 0..(upper_bound as usize) {
+        let val = xshft.next_val();
+        ckms.insert(val);
+    }
+});
diff --git a/src/ckms/store.rs b/src/ckms/store.rs
@@ -1,8 +1,46 @@
+//! store - a 'poor man's skiplist' for CKMS
+//!
+//! The CKMS requires a storage data structure that has cheap inserts and
+//! constant-ish loookups. That's because:
+//!
+//!  * insertion implies search
+//!  * compression implies search, shifting of data
+//!  * query is search
+//!
+//! Prior to 0.7 CKMS used a Vec for storing its samples. This worked well
+//! enough for small collections of samples, but fell over once you got past
+//! around 50k on account of the expense of shifting data around, performing a
+//! search for every insert.
+//!
+//! What we've done in this module is build a "poor man's" skiplist --
+//! constructed of nested Vecs of bounded sized -- which contains at each 'node'
+//! all the information we need to perform an insert, as if we had examined
+//! every sample in the block. Anyhow, we'll get into it below.
 use std::fmt;
 use std::ops::{Index, IndexMut};
 
 use ckms::entry::Entry;
 
+/// The all-important CKMS invariant.
+pub fn invariant(r: f64, error: f64) -> u32 {
+    let i = (2.0 * error * r).floor() as u32;
+    if i == 0 {
+        1
+    } else {
+        i
+    }
+}
+
+/// Inner is the 'node' of our poor-man's skiplist. Each Inner stores a block of
+/// samples of bounded size -- controlled by `inner_cap` set on `Store` -- and a
+/// `g_sum`. The CKMS algorithm requires samples to be stored in sorted order
+/// and the insertion procedure builds up a `g_sum`, a sum of the ranks of each
+/// sample seen. To avoid re-computing this `g_sum` as we search for our
+/// insertion spot we instead keep a `g_sum` on Inner, meaning if we determine
+/// that the block will not be inserted into we can just pull `g_sum` and avoid
+/// inspecting every sample in the block. This implies a touch more work on
+/// every insertion but we come out well ahead not inspecting every sample, even
+/// so.
 #[derive(Clone, PartialEq, Debug)]
 #[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))]
 pub struct Inner<T>
@@ -13,15 +51,6 @@ where
     g_sum: u32,
 }
 
-pub fn invariant(r: f64, error: f64) -> u32 {
-    let i = (2.0 * error * r).floor() as u32;
-    if i == 0 {
-        1
-    } else {
-        i
-    }
-}
-
 impl<T> Inner<T>
 where
     T: PartialEq + PartialOrd + Copy,
@@ -30,6 +59,14 @@ where
         self.data.len()
     }
 
+    /// split_off is patterned on the operation of `Vec::split_off`.
+    ///
+    /// The notion here is when an Inner goes over `Store::inner_cap` we need to
+    /// split the samples that fall over `inner_cap` into a new Inner. This
+    /// keeps our `inner_cap` bound going and reduces the O(n) burden of
+    /// inserting into a Vec.
+    ///
+    /// The correct `g_sum` is maintained for both sides in the split.
     pub fn split_off(&mut self, index: usize) -> Self {
         assert!(index < self.data.len());
         let nxt = self.data.split_off(index);
@@ -48,14 +85,16 @@ pub struct Store<T>
 where
     T: PartialEq,
 {
-    // We aim for the full biased quantiles method. The paper this
-    // implementation is based on includes a 'targeted' method but the authors
-    // have granted that it is flawed in private communication. As such, all
-    // queries for all quantiles will have the same error factor.
+    /// The way CKMS works, we are allowed to respond back to a user query
+    /// inaccurately. Just, with known inaccuracy. That's what this is, the
+    /// known inaccuracy. What's neat is we can perform compression on the
+    /// stored samples so long as we keep within this error bound.
     pub error: f64,
 
+    /// Our collction of samples. See documentation of Inner for more details.
     pub data: Vec<Inner<T>>,
-    inner_cap: usize,
+
+    inner_cap: usize, // maximum size of an Inner
     len: usize, // samples currently stored
     n: usize,   // total samples ever stored
 }
@@ -79,10 +118,26 @@ where
         }
     }
 
+    /// Insert a point into the Store
     pub fn insert(&mut self, element: T) -> ()
     where
         T: fmt::Debug,
     {
+        // This function is a touch repetative. There are three possible
+        // conditions when we insert a point:
+        //
+        //  * point goes to the front
+        //  * point goes to the back
+        //  * point goes somewhere in the middle
+        //
+        // Insertion into the middle is the most expensive. A point inserted at
+        // the front or back has a 'delta' -- see the referenced paper for full
+        // details -- of 0. A point that goes into the middle has a delta
+        // derived from the invariant, the rank of the inserted sample. That
+        // implies a search. This store is able to skip a linear seek by
+        // examining the max element of Inner caches, their associated maximum
+        // rank, g_sum.
+
         // insert at the front
         if self.data[0].data.is_empty() || (self.data[0].data[0].v >= element) {
             self.data[0].data.insert(