Merge pull request #103 from predict-idlab/lttbv2_cp
Lttbv2 🍒 ⛏️  branch
jvdd authored Jul 17, 2022
2 parents 1e4e069 + 06c4a33 commit c8de84b
Showing 7 changed files with 732 additions and 473 deletions.
84 changes: 84 additions & 0 deletions build.py
@@ -0,0 +1,84 @@
import os
import shutil
import sys

from distutils.command.build_ext import build_ext
from distutils.core import Distribution
from distutils.core import Extension
from distutils.errors import CCompilerError
from distutils.errors import DistutilsExecError
from distutils.errors import DistutilsPlatformError

import numpy as np

# C Extensions
with_extensions = True


def get_script_path():
return os.path.dirname(os.path.realpath(sys.argv[0]))

extensions = []
if with_extensions:
extensions = [
Extension(
name="plotly_resampler.aggregation.algorithms.lttbcv2",
sources=["plotly_resampler/aggregation/algorithms/lttbcv2.c"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
include_dirs=[np.get_include(), get_script_path()],
),
]


class BuildFailed(Exception):
    pass


class ExtBuilder(build_ext):
    # This class reports C extension build failures and re-raises the error.

built_extensions = []

def run(self):
try:
build_ext.run(self)
except (DistutilsPlatformError, FileNotFoundError) as e:
print(" Unable to build the C extensions.")
raise e

def build_extension(self, ext):
try:
build_ext.build_extension(self, ext)
except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ValueError) as e:
            print(' Unable to build the "{}" C extension.'.format(ext.name))
raise e


def build(setup_kwargs):
"""
This function is mandatory in order to build the extensions.
"""
distribution = Distribution({"name": "plotly_resampler", "ext_modules": extensions})
distribution.package_dir = "plotly_resampler"

cmd = ExtBuilder(distribution)
cmd.ensure_finalized()
cmd.run()

# Copy built extensions back to the project
for output in cmd.get_outputs():
relative_extension = os.path.relpath(output, cmd.build_lib)
if not os.path.exists(output):
continue

shutil.copyfile(output, relative_extension)
mode = os.stat(relative_extension).st_mode
mode |= (mode & 0o444) >> 2
os.chmod(relative_extension, mode)

return setup_kwargs


if __name__ == "__main__":
build({})
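
A note on the permission tweak at the end of build(): "mode |= (mode & 0o444) >> 2" copies each read bit to the matching execute bit on the extension copied into the source tree. A quick sanity check in plain Python (values illustrative):

mode = 0o644                   # rw-r--r--
mode |= (mode & 0o444) >> 2
assert oct(mode) == "0o755"    # rwxr-xr-x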
2 changes: 1 addition & 1 deletion plotly_resampler/aggregation/aggregation_interface.py
@@ -95,7 +95,7 @@ def _insert_gap_none(self, s: pd.Series) -> pd.Series:
         df_gap_idx = s.index.values[s_idx_diff > 3 * med_diff]
         if len(df_gap_idx):
             df_res_gap = pd.Series(
-                index=df_gap_idx, data=None, name=s.name, copy=False
+                index=df_gap_idx, data=None, name=s.name, copy=False, dtype=s.dtype
             )

             if isinstance(df_res_gap.index, pd.DatetimeIndex):
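
For context on this one-line change: building a Series from data=None without an explicit dtype leaves the dtype to pandas' inference (recent pandas versions warn that the default for such empty Series is changing), so the inserted gap markers could end up with a different dtype than the series they pad. A small illustration, assuming a float64 series:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
# Pinning dtype=s.dtype keeps the all-None gap Series aligned with s,
# instead of whatever dtype pandas would infer for empty data:
gap = pd.Series(index=[10, 20], data=None, name=s.name, dtype=s.dtype)
print(gap.dtype)  # float64, same as s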
61 changes: 19 additions & 42 deletions plotly_resampler/aggregation/aggregators.py
@@ -11,11 +11,11 @@

 import math

-import lttbc
 import numpy as np
 import pandas as pd

 from ..aggregation.aggregation_interface import AbstractSeriesAggregator
+from .algorithms import lttbcv2


 class LTTB(AbstractSeriesAggregator):
@@ -73,42 +73,19 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
         )

     def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # if we have categorical data, LTTB will convert the categorical values into
         # their numeric codes, i.e., the index position of the category array
         s_v = s.cat.codes.values if str(s.dtype) == "category" else s.values
-        s_i = s.index.values
-
-        if s_i.dtype.type == np.datetime64:
-            # lttbc does not support this datatype -> convert to int
-            # (where the time is represented in ns)
-            # REMARK:
-            #   -> additional logic is needed to mitigate rounding errors
-            #   First, the start offset is subtracted, after which the input series
-            #   is cast to the already requested format, i.e. np.float64
-
-            # NOTE -> rounding errors can still persist, but this approach is already
-            # significantly less prone to them than the previous implementation.
-            s_i0 = s_i[0].astype(np.int64)
-            idx, data = lttbc.downsample(
-                (s_i.astype(np.int64) - s_i0).astype(np.float64), s_v, n_out
-            )
-
-            # add the start-offset and convert back to datetime
-            idx = pd.to_datetime(
-                idx.astype(np.int64) + s_i0, unit="ns", utc=True
-            ).tz_convert(s.index.tz)
-        else:
-            idx, data = lttbc.downsample(s_i, s_v, n_out)
-            idx = idx.astype(s_i.dtype)
+        s_i = s.index.values
+        s_i = s_i.astype(np.int64) if s_i.dtype.type == np.datetime64 else s_i

-        if str(s.dtype) == "category":
-            # reconvert the downsampled numeric codes to the category array
-            data = np.vectorize(s.dtype.categories.values.item)(data.astype(s_v.dtype))
-        else:
-            # default case, use the series' dtype as return type
-            data = data.astype(s.dtype)
+        index = lttbcv2.downsample_return_index(s_i, s_v, n_out)

-        return pd.Series(index=idx, data=data, name=str(s.name), copy=False)
+        return pd.Series(
+            index=s.index[index],
+            data=s.values[index],
+            name=str(s.name),
+            copy=False,
+        )


 class MinMaxOverlapAggregator(AbstractSeriesAggregator):
@@ -166,14 +143,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # Calculate the argmin & argmax on the reshaped view of `s` &
         # add the corresponding offset
         argmin = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmin(axis=1)
             + offset
         )
         argmax = (
-            s.iloc[argmax_offset : block_size * offset.shape[0] + argmax_offset]
-            .values.reshape(-1, block_size)
+            s.values[argmax_offset : block_size * offset.shape[0] + argmax_offset]
+            .reshape(-1, block_size)
             .argmax(axis=1)
             + offset
             + argmax_offset
@@ -231,14 +208,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # Calculate the argmin & argmax on the reshaped view of `s` &
         # add the corresponding offset
         argmin = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmin(axis=1)
             + offset
         )
         argmax = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmax(axis=1)
             + offset
         )
@@ -297,7 +274,7 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
         )

     def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
-        if s.shape[0] > n_out * 1_000:
+        if s.shape[0] > n_out * 2_000:
             s = self.minmax._aggregate(s, n_out * 50)
         return self.lttb._aggregate(s, n_out)
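
The LTTB rewrite above also changes the downsampling contract: lttbcv2 returns the selected integer positions rather than new (x, y) arrays, and _aggregate simply indexes the original Series with them. That is what makes the old datetime round-trip and the category re-mapping code unnecessary. A rough sketch of the idea, where pick_indices is a hypothetical stand-in for the C routine (evenly spaced picks, not the real LTTB selection criterion):

import numpy as np
import pandas as pd

def pick_indices(y: np.ndarray, n_out: int) -> np.ndarray:
    # Hypothetical stand-in: first/last point plus evenly spaced interior ones.
    return np.linspace(0, len(y) - 1, n_out).round().astype(np.int64)

s = pd.Series(
    np.sin(np.linspace(0, 20, 10_000)),
    index=pd.date_range("2022-07-17", periods=10_000, freq="s", tz="UTC"),
)
idx = pick_indices(s.values, 500)
# Indexing the original Series keeps the tz-aware DatetimeIndex and the
# original dtype intact - no int64 conversion or category decoding needed:
out = pd.Series(index=s.index[idx], data=s.values[idx], name=str(s.name))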
165 changes: 165 additions & 0 deletions plotly_resampler/aggregation/algorithms/lttbcv2.c
@@ -0,0 +1,165 @@
#define PY_SSIZE_T_CLEAN
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <Python.h> // pull the python API into the current namespace
#include <numpy/arrayobject.h>
#include <numpy/npy_math.h>
#include <math.h>

// This code is adapted from https://github.com/dgoeries/lttbc
// Most credits are due to https://github.com/dgoeries

// The method below assumes the x-deltas are equidistant.
static PyObject* downsample_return_index(PyObject *self, PyObject *args) {
int threshold;
PyObject *x_obj = NULL, *y_obj = NULL;
PyArrayObject *x_array = NULL, *y_array = NULL;

if (!PyArg_ParseTuple(args, "OOi", &x_obj, &y_obj, &threshold))
return NULL;

if ((!PyArray_Check(x_obj) && !PyList_Check(x_obj)) || (!PyArray_Check(y_obj) && !PyList_Check(y_obj))) {
PyErr_SetString(PyExc_TypeError, "Function requires x and y input to be of type list or ndarray ...");
goto fail;
}


// Interpret the input objects as numpy arrays, with reqs (contiguous, aligned, and writeable ...)
x_array = (PyArrayObject *)PyArray_FROM_OTF(x_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
y_array = (PyArrayObject *)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
if (x_array == NULL || y_array == NULL) {
goto fail;
}

    if (PyArray_NDIM(x_array) != 1 || PyArray_NDIM(y_array) != 1) {
PyErr_SetString(PyExc_ValueError, "Both x and y must have a single dimension ...");
goto fail;
}

if (!PyArray_SAMESHAPE(x_array, y_array)) {
PyErr_SetString(PyExc_ValueError, "Input x and y must have the same shape ...");
goto fail;
}

// Declare data length and check if we actually have to downsample!
const Py_ssize_t data_length = (Py_ssize_t)PyArray_DIM(y_array, 0);
if (threshold >= data_length || threshold <= 0) {
// Nothing to do!
PyObject *value = Py_BuildValue("O", x_array);
Py_DECREF(x_array);
Py_DECREF(y_array);
return value;
}

// Access the data in the NDArray!
double *y = (double*)PyArray_DATA(y_array);

// Create an empty output array with shape and dim for the output!
npy_intp dims[1];
dims[0] = threshold;
PyArrayObject *sampled_x = (PyArrayObject *)PyArray_Empty(1, dims,
PyArray_DescrFromType(NPY_INT64), 0);

// Get a pointer to its data
long long *sampled_x_data = (long long*)PyArray_DATA(sampled_x);

// The main loop here!
Py_ssize_t sampled_index = 0;
const double every = (double)(data_length - 2) / (threshold - 2);

Py_ssize_t a = 0;
Py_ssize_t next_a = 0;

// Always add the first point!
sampled_x_data[sampled_index] = 0;

sampled_index++;
Py_ssize_t i;
for (i = 0; i < threshold - 2; ++i) {
// Calculate point average for next bucket (containing c)
        double avg_x = 0;  // overwritten below with the bucket mid-point
double avg_y = 0;
Py_ssize_t avg_range_start = (Py_ssize_t)(floor((i + 1)* every) + 1);
Py_ssize_t avg_range_end = (Py_ssize_t)(floor((i + 2) * every) + 1);
if (avg_range_end >= data_length){
avg_range_end = data_length;
}
Py_ssize_t avg_range_length = avg_range_end - avg_range_start;

for (;avg_range_start < avg_range_end; avg_range_start++){
avg_y += y[avg_range_start];
}
avg_y /= avg_range_length;
avg_x = avg_range_start + every / 2;

// Get the range for this bucket
Py_ssize_t range_offs = (Py_ssize_t)(floor((i + 0) * every) + 1);
Py_ssize_t range_to = (Py_ssize_t)(floor((i + 1) * every) + 1);


// Point a
double point_a_y = y[a];

double max_area = -1.0;
for (; range_offs < range_to; range_offs++){
// Calculate triangle area over three buckets
double area = fabs((a - avg_x) * (y[range_offs] - point_a_y) - (a - range_offs) * (avg_y - point_a_y)) * 0.5;
if (area > max_area){
max_area = area;
next_a = range_offs; // Next a is this b
}
}
// Pick this point from the bucket
sampled_x_data[sampled_index] = next_a;

sampled_index++;

// Current a becomes the next_a (chosen b)
a = next_a;
}

    // Always add the last point!
sampled_x_data[sampled_index] = data_length - 1;

// Provide our return value
PyObject *value = Py_BuildValue("O", sampled_x);

// And remove the references!
Py_DECREF(x_array);
Py_DECREF(y_array);
Py_XDECREF(sampled_x);

return value;

fail:
    Py_XDECREF(x_array);  // may be NULL when the early type check fails
Py_XDECREF(y_array);
return NULL;
}


// Method definition object
static PyMethodDef lttbcv2Methods[] = {
{
"downsample_return_index", // The name of the method
downsample_return_index, // Function pointer to the method implementation
METH_VARARGS,
"Compute the largest triangle three buckets (LTTB) algorithm in a C extension."
},
{NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef lttbc_module_definition = {
PyModuleDef_HEAD_INIT,
"lttbcv2", /* name of module */
"A Python module that computes the largest triangle three buckets algorithm (LTTB) using C code.",
-1, /* size of per-interpreter state of the module,
or -1 if the module keeps state in global variables. */
lttbcv2Methods
};

// Module initialization
PyMODINIT_FUNC PyInit_lttbcv2(void) {
Py_Initialize();
import_array();
return PyModule_Create(&lttbc_module_definition);
}
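
Consistent with the comment at the top of the file, only y is read inside the main loop; bucket boundaries and triangle areas are computed from array positions, so the x values are merely validated (and returned as-is when the threshold is out of range). A quick smoke test of the compiled module, assuming the build above succeeded:

import numpy as np
from plotly_resampler.aggregation.algorithms import lttbcv2

x = np.arange(10_000, dtype=np.float64)
y = np.random.default_rng(42).standard_normal(10_000)
idx = lttbcv2.downsample_return_index(x, y, 1_000)
print(idx.dtype, idx.shape)  # int64 (1000,) - positions into x/y, not values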