Merge pull request #103 from predict-idlab/lttbv2_cp
Lttbv2 🍒 ⛏️  branch
jvdd authored Jul 17, 2022
2 parents 1e4e069 + 06c4a33 commit c8de84b
Showing 7 changed files with 732 additions and 473 deletions.
84 changes: 84 additions & 0 deletions build.py
@@ -0,0 +1,84 @@
import os
import shutil
import sys

from distutils.command.build_ext import build_ext
from distutils.core import Distribution
from distutils.core import Extension
from distutils.errors import CCompilerError
from distutils.errors import DistutilsExecError
from distutils.errors import DistutilsPlatformError

import numpy as np

# C Extensions
with_extensions = True


def get_script_path():
return os.path.dirname(os.path.realpath(sys.argv[0]))

extensions = []
if with_extensions:
extensions = [
Extension(
name="plotly_resampler.aggregation.algorithms.lttbcv2",
sources=["plotly_resampler/aggregation/algorithms/lttbcv2.c"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
include_dirs=[np.get_include(), get_script_path()],
),
]


class BuildFailed(Exception):
    pass


class ExtBuilder(build_ext):
    # This class reports C extension build failures and re-raises the error.

built_extensions = []

def run(self):
try:
build_ext.run(self)
except (DistutilsPlatformError, FileNotFoundError) as e:
print(" Unable to build the C extensions.")
raise e

def build_extension(self, ext):
try:
build_ext.build_extension(self, ext)
except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ValueError) as e:
            print(' Unable to build the "{}" C extension.'.format(ext.name))
raise e


def build(setup_kwargs):
"""
This function is mandatory in order to build the extensions.
"""
distribution = Distribution({"name": "plotly_resampler", "ext_modules": extensions})
distribution.package_dir = "plotly_resampler"

cmd = ExtBuilder(distribution)
cmd.ensure_finalized()
cmd.run()

# Copy built extensions back to the project
for output in cmd.get_outputs():
relative_extension = os.path.relpath(output, cmd.build_lib)
if not os.path.exists(output):
continue

shutil.copyfile(output, relative_extension)
mode = os.stat(relative_extension).st_mode
mode |= (mode & 0o444) >> 2
os.chmod(relative_extension, mode)

return setup_kwargs


if __name__ == "__main__":
build({})
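
A note on the permission tweak at the end of build(): "mode |= (mode & 0o444) >> 2" copies each read bit to the matching execute bit on the extension copied into the source tree. A quick sanity check in plain Python (values illustrative):

mode = 0o644                   # rw-r--r--
mode |= (mode & 0o444) >> 2
assert oct(mode) == "0o755"    # rwxr-xr-x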
2 changes: 1 addition & 1 deletion plotly_resampler/aggregation/aggregation_interface.py
@@ -95,7 +95,7 @@ def _insert_gap_none(self, s: pd.Series) -> pd.Series:
         df_gap_idx = s.index.values[s_idx_diff > 3 * med_diff]
         if len(df_gap_idx):
             df_res_gap = pd.Series(
-                index=df_gap_idx, data=None, name=s.name, copy=False
+                index=df_gap_idx, data=None, name=s.name, copy=False, dtype=s.dtype
             )

             if isinstance(df_res_gap.index, pd.DatetimeIndex):
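
For context on this one-line change: building a Series from data=None without an explicit dtype leaves the dtype to pandas' inference (recent pandas versions warn that the default for such empty Series is changing), so the inserted gap markers could end up with a different dtype than the series they pad. A small illustration, assuming a float64 series:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
# Pinning dtype=s.dtype keeps the all-None gap Series aligned with s,
# instead of whatever dtype pandas would infer for empty data:
gap = pd.Series(index=[10, 20], data=None, name=s.name, dtype=s.dtype)
print(gap.dtype)  # float64, same as s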
61 changes: 19 additions & 42 deletions plotly_resampler/aggregation/aggregators.py
@@ -11,11 +11,11 @@

 import math

-import lttbc
 import numpy as np
 import pandas as pd

 from ..aggregation.aggregation_interface import AbstractSeriesAggregator
+from .algorithms import lttbcv2


 class LTTB(AbstractSeriesAggregator):
@@ -73,42 +73,19 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
         )

     def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # if we have categorical data, LTTB will convert the categorical values into
         # their numeric codes, i.e., the index position of the category array
         s_v = s.cat.codes.values if str(s.dtype) == "category" else s.values
-        s_i = s.index.values
-
-        if s_i.dtype.type == np.datetime64:
-            # lttbc does not support this datatype -> convert to int
-            # (where the time is represented in ns)
-            # REMARK:
-            #   -> additional logic is needed to mitigate rounding errors
-            #   First, the start offset is subtracted, after which the input series
-            #   is cast to the already requested format, i.e. np.float64
-
-            # NOTE -> rounding errors can still persist, but this approach is already
-            # significantly less prone to them than the previous implementation.
-            s_i0 = s_i[0].astype(np.int64)
-            idx, data = lttbc.downsample(
-                (s_i.astype(np.int64) - s_i0).astype(np.float64), s_v, n_out
-            )
-
-            # add the start-offset and convert back to datetime
-            idx = pd.to_datetime(
-                idx.astype(np.int64) + s_i0, unit="ns", utc=True
-            ).tz_convert(s.index.tz)
-        else:
-            idx, data = lttbc.downsample(s_i, s_v, n_out)
-            idx = idx.astype(s_i.dtype)
+        s_i = s.index.values
+        s_i = s_i.astype(np.int64) if s_i.dtype.type == np.datetime64 else s_i

-        if str(s.dtype) == "category":
-            # reconvert the downsampled numeric codes to the category array
-            data = np.vectorize(s.dtype.categories.values.item)(data.astype(s_v.dtype))
-        else:
-            # default case, use the series' dtype as return type
-            data = data.astype(s.dtype)
+        index = lttbcv2.downsample_return_index(s_i, s_v, n_out)

-        return pd.Series(index=idx, data=data, name=str(s.name), copy=False)
+        return pd.Series(
+            index=s.index[index],
+            data=s.values[index],
+            name=str(s.name),
+            copy=False,
+        )


 class MinMaxOverlapAggregator(AbstractSeriesAggregator):
@@ -166,14 +143,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # Calculate the argmin & argmax on the reshaped view of `s` &
         # add the corresponding offset
         argmin = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmin(axis=1)
             + offset
         )
         argmax = (
-            s.iloc[argmax_offset : block_size * offset.shape[0] + argmax_offset]
-            .values.reshape(-1, block_size)
+            s.values[argmax_offset : block_size * offset.shape[0] + argmax_offset]
+            .reshape(-1, block_size)
             .argmax(axis=1)
             + offset
             + argmax_offset
@@ -231,14 +208,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # Calculate the argmin & argmax on the reshaped view of `s` &
         # add the corresponding offset
         argmin = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmin(axis=1)
             + offset
         )
         argmax = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmax(axis=1)
             + offset
         )
@@ -297,7 +274,7 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
         )

     def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
-        if s.shape[0] > n_out * 1_000:
+        if s.shape[0] > n_out * 2_000:
             s = self.minmax._aggregate(s, n_out * 50)
         return self.lttb._aggregate(s, n_out)
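
The LTTB rewrite above also changes the downsampling contract: lttbcv2 returns the selected integer positions rather than new (x, y) arrays, and _aggregate simply indexes the original Series with them. That is what makes the old datetime round-trip and the category re-mapping code unnecessary. A rough sketch of the idea, where pick_indices is a hypothetical stand-in for the C routine (evenly spaced picks, not the real LTTB selection criterion):

import numpy as np
import pandas as pd

def pick_indices(y: np.ndarray, n_out: int) -> np.ndarray:
    # Hypothetical stand-in: first/last point plus evenly spaced interior ones.
    return np.linspace(0, len(y) - 1, n_out).round().astype(np.int64)

s = pd.Series(
    np.sin(np.linspace(0, 20, 10_000)),
    index=pd.date_range("2022-07-17", periods=10_000, freq="s", tz="UTC"),
)
idx = pick_indices(s.values, 500)
# Indexing the original Series keeps the tz-aware DatetimeIndex and the
# original dtype intact - no int64 conversion or category decoding needed:
out = pd.Series(index=s.index[idx], data=s.values[idx], name=str(s.name))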
165 changes: 165 additions & 0 deletions plotly_resampler/aggregation/algorithms/lttbcv2.c
@@ -0,0 +1,165 @@
#define PY_SSIZE_T_CLEAN
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <Python.h> // pull the python API into the current namespace
#include <numpy/arrayobject.h>
#include <numpy/npy_math.h>
#include <math.h>

// This code is adapted from https://github.com/dgoeries/lttbc
// Most credits are due to https://github.com/dgoeries

// The method below assumes the x-deltas are equidistant.
static PyObject* downsample_return_index(PyObject *self, PyObject *args) {
int threshold;
PyObject *x_obj = NULL, *y_obj = NULL;
PyArrayObject *x_array = NULL, *y_array = NULL;

if (!PyArg_ParseTuple(args, "OOi", &x_obj, &y_obj, &threshold))
return NULL;

if ((!PyArray_Check(x_obj) && !PyList_Check(x_obj)) || (!PyArray_Check(y_obj) && !PyList_Check(y_obj))) {
PyErr_SetString(PyExc_TypeError, "Function requires x and y input to be of type list or ndarray ...");
goto fail;
}


// Interpret the input objects as numpy arrays, with reqs (contiguous, aligned, and writeable ...)
x_array = (PyArrayObject *)PyArray_FROM_OTF(x_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
y_array = (PyArrayObject *)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
if (x_array == NULL || y_array == NULL) {
goto fail;
}

    if (PyArray_NDIM(x_array) != 1 || PyArray_NDIM(y_array) != 1) {
PyErr_SetString(PyExc_ValueError, "Both x and y must have a single dimension ...");
goto fail;
}

if (!PyArray_SAMESHAPE(x_array, y_array)) {
PyErr_SetString(PyExc_ValueError, "Input x and y must have the same shape ...");
goto fail;
}

// Declare data length and check if we actually have to downsample!
const Py_ssize_t data_length = (Py_ssize_t)PyArray_DIM(y_array, 0);
if (threshold >= data_length || threshold <= 0) {
// Nothing to do!
PyObject *value = Py_BuildValue("O", x_array);
Py_DECREF(x_array);
Py_DECREF(y_array);
return value;
}

// Access the data in the NDArray!
double *y = (double*)PyArray_DATA(y_array);

// Create an empty output array with shape and dim for the output!
npy_intp dims[1];
dims[0] = threshold;
PyArrayObject *sampled_x = (PyArrayObject *)PyArray_Empty(1, dims,
PyArray_DescrFromType(NPY_INT64), 0);

// Get a pointer to its data
long long *sampled_x_data = (long long*)PyArray_DATA(sampled_x);

// The main loop here!
Py_ssize_t sampled_index = 0;
const double every = (double)(data_length - 2) / (threshold - 2);

Py_ssize_t a = 0;
Py_ssize_t next_a = 0;

// Always add the first point!
sampled_x_data[sampled_index] = 0;

sampled_index++;
Py_ssize_t i;
for (i = 0; i < threshold - 2; ++i) {
// Calculate point average for next bucket (containing c)
        double avg_x = 0;  // overwritten below with the bucket mid-point
double avg_y = 0;
Py_ssize_t avg_range_start = (Py_ssize_t)(floor((i + 1)* every) + 1);
Py_ssize_t avg_range_end = (Py_ssize_t)(floor((i + 2) * every) + 1);
if (avg_range_end >= data_length){
avg_range_end = data_length;
}
Py_ssize_t avg_range_length = avg_range_end - avg_range_start;

for (;avg_range_start < avg_range_end; avg_range_start++){
avg_y += y[avg_range_start];
}
avg_y /= avg_range_length;
avg_x = avg_range_start + every / 2;

// Get the range for this bucket
Py_ssize_t range_offs = (Py_ssize_t)(floor((i + 0) * every) + 1);
Py_ssize_t range_to = (Py_ssize_t)(floor((i + 1) * every) + 1);


// Point a
double point_a_y = y[a];

double max_area = -1.0;
for (; range_offs < range_to; range_offs++){
// Calculate triangle area over three buckets
double area = fabs((a - avg_x) * (y[range_offs] - point_a_y) - (a - range_offs) * (avg_y - point_a_y)) * 0.5;
if (area > max_area){
max_area = area;
next_a = range_offs; // Next a is this b
}
}
// Pick this point from the bucket
sampled_x_data[sampled_index] = next_a;

sampled_index++;

// Current a becomes the next_a (chosen b)
a = next_a;
}

    // Always add the last point!
sampled_x_data[sampled_index] = data_length - 1;

// Provide our return value
PyObject *value = Py_BuildValue("O", sampled_x);

// And remove the references!
Py_DECREF(x_array);
Py_DECREF(y_array);
Py_XDECREF(sampled_x);

return value;

fail:
    Py_XDECREF(x_array);  // may be NULL when the early type check fails
Py_XDECREF(y_array);
return NULL;
}


// Method definition object
static PyMethodDef lttbcv2Methods[] = {
{
"downsample_return_index", // The name of the method
downsample_return_index, // Function pointer to the method implementation
METH_VARARGS,
"Compute the largest triangle three buckets (LTTB) algorithm in a C extension."
},
{NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef lttbc_module_definition = {
PyModuleDef_HEAD_INIT,
"lttbcv2", /* name of module */
"A Python module that computes the largest triangle three buckets algorithm (LTTB) using C code.",
-1, /* size of per-interpreter state of the module,
or -1 if the module keeps state in global variables. */
lttbcv2Methods
};

// Module initialization
PyMODINIT_FUNC PyInit_lttbcv2(void) {
Py_Initialize();
import_array();
return PyModule_Create(&lttbc_module_definition);
}
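
Consistent with the comment at the top of the file, only y is read inside the main loop; bucket boundaries and triangle areas are computed from array positions, so the x values are merely validated (and returned as-is when the threshold is out of range). A quick smoke test of the compiled module, assuming the build above succeeded:

import numpy as np
from plotly_resampler.aggregation.algorithms import lttbcv2

x = np.arange(10_000, dtype=np.float64)
y = np.random.default_rng(42).standard_normal(10_000)
idx = lttbcv2.downsample_return_index(x, y, 1_000)
print(idx.dtype, idx.shape)  # int64 (1000,) - positions into x/y, not values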