Skip to content

Commit

Permalink
Tests: Improve test coverage in transformers
Browse files Browse the repository at this point in the history
  • Loading branch information
lucianolorenti committed Oct 21, 2023
1 parent 3f51bbc commit 2ce720d
Show file tree
Hide file tree
Showing 10 changed files with 201 additions and 83 deletions.
2 changes: 0 additions & 2 deletions ceruleo/models/keras/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
# flake8: noqa

1 change: 1 addition & 0 deletions ceruleo/models/keras/catalog/MVCNN.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Permute,
Reshape,
)
from tensorflow.keras import Input, Model


def MVCNN(
Expand Down
4 changes: 2 additions & 2 deletions ceruleo/transformation/features/scalers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from ceruleo.transformation import TransformerStep
from ceruleo.transformation.features.tdigest import TDigest
from ceruleo.transformation.utils import QuantileComputer, QuantileEstimator
from ceruleo.transformation.utils import QuantileEstimator


class RobustMinMaxScaler(TransformerStep):
Expand Down Expand Up @@ -212,7 +212,7 @@ class RobustStandardScaler(TransformerStep):
def __init__(self, *, quantile_range=(0.25, 0.75), prefer_partial_fit:bool = False, **kwargs):
super().__init__( **kwargs,prefer_partial_fit=prefer_partial_fit)
self.quantile_range = quantile_range
self.quantile_estimator = QuantileComputer()
self.quantile_estimator = QuantileEstimator()
self.IQR = None
self.median = None

Expand Down
25 changes: 15 additions & 10 deletions ceruleo/transformation/features/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def fit(self, X: pd.DataFrame, y=None):
self
"""
self.mean = X.mean()
return self

def partial_fit(self, X: pd.DataFrame, y=None):
"""Compute incrementally the mean of the dataset
Expand Down Expand Up @@ -94,6 +95,7 @@ def fit(self, X: pd.DataFrame, y=None):
self
"""
self.median = X.median()
return self

def partial_fit(self, X: pd.DataFrame, y=None):
"""Compute incrementally the median of the dataset
Expand Down Expand Up @@ -192,8 +194,8 @@ class Scale(TransformerStep):
Name of the step, by default None
"""

def __init__(self, scale_factor: float, name: Optional[str] = None):
super().__init__(name)
def __init__(self, *, scale_factor: float, name: Optional[str] = None):
super().__init__(name=name)
self.scale_factor = scale_factor

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -249,7 +251,7 @@ class RollingCentering(TransformerStep):
"""

def __init__(self, window: int, min_points: int, name: Optional[str] = None):
def __init__(self, *, window: int, min_points: int, name: Optional[str] = None):
super().__init__(name=name)
self.window = window
self.min_points = min_points
Expand Down Expand Up @@ -309,8 +311,8 @@ class Accumulate(TransformerStep):
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6621413
"""

def __init__(self, normalize: bool = False, *args):
super().__init__(*args)
def __init__(self, *, normalize: bool = False, name: Optional[str] = None):
super().__init__(name=name)
self.normalize = normalize

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -379,8 +381,8 @@ def transform(self, X):
class Apply(TransformerStep):
"""Apply the function element-wise"""

def __init__(self, fun, *args):
super().__init__(*args)
def __init__(self, *, fun, name: Optional[str] = None):
super().__init__(name=name)
self.fun = fun

def transform(self, X):
Expand Down Expand Up @@ -432,13 +434,16 @@ def transform(self, X):
class Peaks(TransformerStep):
"""Mark the peaks of each feature: 1 at every detected peak position, 0 elsewhere"""

def __init__(self, *args):
super().__init__(*args)
distance: float

def __init__(self, *, distance:float, name : Optional[str] = None):
super().__init__(name=name)
self.distance = distance

def transform(self, X):
new_X = pd.DataFrame(np.zeros(X.shape), index=X.index, columns=X.columns)
for i, c in enumerate(X.columns):
peaks_positions, _ = find_peaks(X[c].values, distance=50)
peaks_positions, _ = find_peaks(X[c].values, distance=self.distance)
new_X.iloc[peaks_positions, i] = 1

return new_X
67 changes: 22 additions & 45 deletions ceruleo/transformation/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from concurrent.futures import ProcessPoolExecutor
from typing import List, Optional, Union
from typing import Callable, List, Optional, Union

import numpy as np
import pandas as pd
Expand All @@ -14,9 +14,11 @@ def transform(self, X):


class TransformerLambda(TransformerStep):
def __init__(self, f, name: Optional[str] = None):
super().__init__(name)
self.f = f
def __init__(
self, *, callback: Callable[[pd.DataFrame], pd.DataFrame], name: Optional[str] = None
):
super().__init__(name=name)
self.f = callback

def transform(self, X, y=None):
return self.f(X)
Expand Down Expand Up @@ -89,9 +91,15 @@ def build_tdigest(tdigest, values, column):

class QuantileEstimator:
"""Approximate the quantile of each feature in the dataframe
using t-digest
using t-digest
"""
def __init__(self, tdigest_size:int = 200, max_workers:int = 1, subsample:Optional[Union[int, float]] = None):

def __init__(
self,
tdigest_size: int = 200,
max_workers: int = 1,
subsample: Optional[Union[int, float]] = None,
):
self.tdigest_dict = None
self.tdigest_size = tdigest_size
self.max_workers = max_workers
Expand All @@ -102,19 +110,18 @@ def update(self, X: pd.DataFrame):
return self

columns = X.columns

if self.tdigest_dict is None:
self.tdigest_dict = {c: TDigest(self.tdigest_size) for c in columns}


results = []
with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
for i, c in enumerate(columns):

x = X.iloc[:, i].dropna()
if self.subsample is not None:

if isinstance( self.subsample, int):
if isinstance(self.subsample, int):
points_to_sample = self.subsample
else:
points_to_sample = self.subsample * X.shape[0]
Expand All @@ -131,17 +138,17 @@ def update(self, X: pd.DataFrame):

def estimate_quantile(self, *args, **kwargs):
return self.quantile(*args, **kwargs)

def quantile(
self, q: float, feature: Optional[str] = None
) -> Union[pd.Series, float]:
"""Estimate the quantile for a set of features
Parameters
----------
q:float
The quantile to estimate
feature:Optional[str] """
feature:Optional[str]"""
if feature is not None:
return self.tdigest_dict[feature].estimate_quantile(q)
else:
Expand All @@ -152,47 +159,17 @@ def quantile(
}
)

class QuantileComputer:
"""Compute the quantile of each feature in the dataframe
from the accumulated raw values (no t-digest is used here)
"""
def __init__(self, subsample_rate:float = 1):
self.values = None
self.tdigest_size = subsample_rate

def update(self, X: pd.DataFrame):
if X.shape[0] < 2:
return self

columns = X.columns

if self.values_dict is None:
self.values = X.copy()
else:
self.values = pd.concat(self.values, X)


return self

def quantile(
self, q: float, feature: Optional[str] = None
) -> Union[pd.Series, float]:
if feature is not None:
return self.values.quantile(q)
else:
return self.values[feature].quantile(q)


class Literal(TransformerStep):
def __init__(self, literal, *args):
super().__init__(*args)
self.literal = literal
self.literal = literal

def transform(self, X):
return self.literal


def ensure_step(step):
if isinstance(step, TransformerStep):
return step
return step
return Literal(step)
2 changes: 1 addition & 1 deletion ceruleo/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
def download(URL:str, output_path: Path):
response = requests.get(URL, stream=True)
total_size_in_bytes= int(response.headers.get('content-length', 0))
block_size = 1024 #1 Kibibyte
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(output_path, 'wb') as file:
for data in response.iter_content(block_size):
Expand Down
12 changes: 11 additions & 1 deletion tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ceruleo.models.keras.catalog.CNLSTM import CNLSTM
from ceruleo.models.keras.catalog.InceptionTime import InceptionTime
from ceruleo.models.keras.catalog.MSWRLRCN import MSWRLRCN
from ceruleo.models.keras.catalog.MVCNN import MVCNN
from ceruleo.models.keras.catalog.MultiScaleConvolutional import (
MultiScaleConvolutionalModel,
)
Expand Down Expand Up @@ -283,6 +284,13 @@ def test_catalog(self):
print(type(mmap))
assert isinstance(mmap, np.ndarray)

#model, model_extras = MVCNN(ds_iterator.shape)
#_test_model_basic(model, ds_iterator)
#X, y, sw = next(iter(ds_iterator))
#(mmap, v) = explain(model_extras, X)
#print(type(mmap))
#assert isinstance(mmap, np.ndarray)

def test_baseline(self):
ds = MockDataset(5)
features = ["feature1", "feature2"]
Expand Down Expand Up @@ -388,7 +396,6 @@ def test_losses(self):

_test_model_basic(model, ds_iterator, loss=relative_mse(C=0.5))

print(type(root_mean_squared_error(tf.random.uniform((50,)), tf.random.uniform((50,))).numpy()))
assert isinstance(
root_mean_squared_error(tf.random.uniform((50,)), tf.random.uniform((50,))).numpy(), np.float32
)
Expand All @@ -405,3 +412,6 @@ def test_losses(self):
).numpy(),
np.float32,
)



3 changes: 2 additions & 1 deletion tests/test_scalers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class TestImputers():

def test_PandasRemoveInf(self):
def test_RobustMinMaxScaler(self):

scaler = RobustMinMaxScaler(range=(-1, 1), clip=False, lower_quantile=0.1, upper_quantile=0.9)
sk_scaler = RobustScaler(with_centering=False, with_scaling=True, quantile_range=(10, 90))
Expand All @@ -31,3 +31,4 @@ def test_PandasRemoveInf(self):

sk_scaler.transform(df1)


Loading

0 comments on commit 2ce720d

Please sign in to comment.