From 2ce720def2fe3941fb846444c5197447880b279a Mon Sep 17 00:00:00 2001 From: Luciano Lorenti Date: Sat, 21 Oct 2023 16:17:30 +0200 Subject: [PATCH] Tests: Improve test coverage in transformers --- ceruleo/models/keras/__init__.py | 2 - ceruleo/models/keras/catalog/MVCNN.py | 1 + ceruleo/transformation/features/scalers.py | 4 +- .../transformation/features/transformation.py | 25 +-- ceruleo/transformation/utils.py | 67 +++---- ceruleo/utils/download.py | 2 +- tests/test_models.py | 12 +- tests/test_scalers.py | 3 +- tests/test_transformers.py | 164 +++++++++++++++--- tests/test_utils.py | 4 + 10 files changed, 201 insertions(+), 83 deletions(-) diff --git a/ceruleo/models/keras/__init__.py b/ceruleo/models/keras/__init__.py index efa62527..e69de29b 100644 --- a/ceruleo/models/keras/__init__.py +++ b/ceruleo/models/keras/__init__.py @@ -1,2 +0,0 @@ -# flake8: noqa - diff --git a/ceruleo/models/keras/catalog/MVCNN.py b/ceruleo/models/keras/catalog/MVCNN.py index 02c7c4eb..54c297f4 100644 --- a/ceruleo/models/keras/catalog/MVCNN.py +++ b/ceruleo/models/keras/catalog/MVCNN.py @@ -9,6 +9,7 @@ Permute, Reshape, ) +from tensorflow.keras import Input, Model def MVCNN( diff --git a/ceruleo/transformation/features/scalers.py b/ceruleo/transformation/features/scalers.py index 9920f270..f98a6629 100644 --- a/ceruleo/transformation/features/scalers.py +++ b/ceruleo/transformation/features/scalers.py @@ -4,7 +4,7 @@ import pandas as pd from ceruleo.transformation import TransformerStep from ceruleo.transformation.features.tdigest import TDigest -from ceruleo.transformation.utils import QuantileComputer, QuantileEstimator +from ceruleo.transformation.utils import QuantileEstimator class RobustMinMaxScaler(TransformerStep): @@ -212,7 +212,7 @@ class RobustStandardScaler(TransformerStep): def __init__(self, *, quantile_range=(0.25, 0.75), prefer_partial_fit:bool = False, **kwargs): super().__init__( **kwargs,prefer_partial_fit=prefer_partial_fit) self.quantile_range = quantile_range - self.quantile_estimator = QuantileComputer() + self.quantile_estimator = QuantileEstimator() self.IQR = None self.median = None diff --git a/ceruleo/transformation/features/transformation.py b/ceruleo/transformation/features/transformation.py index 9663dd0b..80278838 100644 --- a/ceruleo/transformation/features/transformation.py +++ b/ceruleo/transformation/features/transformation.py @@ -31,6 +31,7 @@ def fit(self, X: pd.DataFrame, y=None): self """ self.mean = X.mean() + return self def partial_fit(self, X: pd.DataFrame, y=None): """Compute incrementally the mean of the dataset @@ -94,6 +95,7 @@ def fit(self, X: pd.DataFrame, y=None): self """ self.median = X.median() + return self def partial_fit(self, X: pd.DataFrame, y=None): """Compute incrementally the mean of the dataset @@ -192,8 +194,8 @@ class Scale(TransformerStep): Name of the step, by default None """ - def __init__(self, scale_factor: float, name: Optional[str] = None): - super().__init__(name) + def __init__(self, *, scale_factor: float, name: Optional[str] = None): + super().__init__(name=name) self.scale_factor = scale_factor def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -249,7 +251,7 @@ class RollingCentering(TransformerStep): """ - def __init__(self, window: int, min_points: int, name: Optional[str] = None): + def __init__(self, *, window: int, min_points: int, name: Optional[str] = None): super().__init__(name=name) self.window = window self.min_points = min_points @@ -309,8 +311,8 @@ class Accumulate(TransformerStep): https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6621413 """ - def __init__(self, normalize: bool = False, *args): - super().__init__(*args) + def __init__(self, *, normalize: bool = False, name: Optional[str] = None): + super().__init__(name=name) self.normalize = normalize def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -379,8 +381,8 @@ def transform(self, X): class Apply(TransformerStep): """Apply the function element-wise""" - def __init__(self, fun, *args): - super().__init__(*args) + def __init__(self, *, fun, name: Optional[str] = None): + super().__init__(name=name) self.fun = fun def transform(self, X): @@ -432,13 +434,16 @@ def transform(self, X): class Peaks(TransformerStep): """Peaks""" - def __init__(self, *args): - super().__init__(*args) + distance: float + + def __init__(self, *, distance:float, name : Optional[str] = None): + super().__init__(name=name) + self.distance = distance def transform(self, X): new_X = pd.DataFrame(np.zeros(X.shape), index=X.index, columns=X.columns) for i, c in enumerate(X.columns): - peaks_positions, _ = find_peaks(X[c].values, distance=50) + peaks_positions, _ = find_peaks(X[c].values, distance=self.distance) new_X.iloc[peaks_positions, i] = 1 return new_X diff --git a/ceruleo/transformation/utils.py b/ceruleo/transformation/utils.py index 1b8d1f08..f8b09a3c 100644 --- a/ceruleo/transformation/utils.py +++ b/ceruleo/transformation/utils.py @@ -1,5 +1,5 @@ from concurrent.futures import ProcessPoolExecutor -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import numpy as np import pandas as pd @@ -14,9 +14,11 @@ def transform(self, X): class TransformerLambda(TransformerStep): - def __init__(self, f, name: Optional[str] = None): - super().__init__(name) - self.f = f + def __init__( + self, *, callback: Callable[[pd.DataFrame], pd.DataFrame], name: Optional[str] = None + ): + super().__init__(name=name) + self.f = callback def transform(self, X, y=None): return self.f(X) @@ -89,9 +91,15 @@ def build_tdigest(tdigest, values, column): class QuantileEstimator: """Approximate the quantile of each feature in the dataframe - using t-digest + using t-digest """ - def __init__(self, tdigest_size:int = 200, max_workers:int = 1, subsample:Optional[Union[int, float]] = None): + + def __init__( + self, + tdigest_size: int = 200, + max_workers: int = 1, + subsample: Optional[Union[int, float]] = None, + ): self.tdigest_dict = None self.tdigest_size = tdigest_size self.max_workers = max_workers @@ -102,11 +110,10 @@ def update(self, X: pd.DataFrame): return self columns = X.columns - + if self.tdigest_dict is None: self.tdigest_dict = {c: TDigest(self.tdigest_size) for c in columns} - results = [] with ProcessPoolExecutor(max_workers=self.max_workers) as executor: for i, c in enumerate(columns): @@ -114,7 +121,7 @@ def update(self, X: pd.DataFrame): x = X.iloc[:, i].dropna() if self.subsample is not None: - if isinstance( self.subsample, int): + if isinstance(self.subsample, int): points_to_sample = self.subsample else: points_to_sample = self.subsample * X.shape[0] @@ -131,17 +138,17 @@ def update(self, X: pd.DataFrame): def estimate_quantile(self, *args, **kwargs): return self.quantile(*args, **kwargs) - + def quantile( self, q: float, feature: Optional[str] = None ) -> Union[pd.Series, float]: """Estimate the quantile for a set of features - + Parameters ---------- q:float The quantile to estimate - feature:Optional[Str] """ + feature:Optional[Str]""" if feature is not None: return self.tdigest_dict[feature].estimate_quantile(q) else: @@ -152,47 +159,17 @@ def quantile( } ) -class QuantileComputer: - """Approximate the quantile of each feature in the dataframe - using t-digest - """ - def __init__(self, subsample_rate:float = 1): - self.values = None - self.tdigest_size = subsample_rate - - def update(self, X: pd.DataFrame): - if X.shape[0] < 2: - return self - - columns = X.columns - - if self.values_dict is None: - self.values = X.copy() - else: - self.values = pd.concat(self.values, X) - - - return self - - def quantile( - self, q: float, feature: Optional[str] = None - ) -> Union[pd.Series, float]: - if feature is not None: - return self.values.quantile(q) - else: - return self.values[feature].quantile(q) - class Literal(TransformerStep): def __init__(self, literal, *args): super().__init__(*args) - self.literal = literal - + self.literal = literal + def transform(self, X): return self.literal def ensure_step(step): if isinstance(step, TransformerStep): - return step + return step return Literal(step) diff --git a/ceruleo/utils/download.py b/ceruleo/utils/download.py index 0a387eed..1e006aaf 100644 --- a/ceruleo/utils/download.py +++ b/ceruleo/utils/download.py @@ -5,7 +5,7 @@ def download(URL:str, output_path: Path): response = requests.get(URL, stream=True) total_size_in_bytes= int(response.headers.get('content-length', 0)) - block_size = 1024 #1 Kibibyte + block_size = 1024 progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) with open(output_path, 'wb') as file: for data in response.iter_content(block_size): diff --git a/tests/test_models.py b/tests/test_models.py index c3893de7..112cfd77 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,6 +12,7 @@ from ceruleo.models.keras.catalog.CNLSTM import CNLSTM from ceruleo.models.keras.catalog.InceptionTime import InceptionTime from ceruleo.models.keras.catalog.MSWRLRCN import MSWRLRCN +from ceruleo.models.keras.catalog.MVCNN import MVCNN from ceruleo.models.keras.catalog.MultiScaleConvolutional import ( MultiScaleConvolutionalModel, ) @@ -283,6 +284,13 @@ def test_catalog(self): print(type(mmap)) assert isinstance(mmap, np.ndarray) + #model, model_extras = MVCNN(ds_iterator.shape) + #_test_model_basic(model, ds_iterator) + #X, y, sw = next(iter(ds_iterator)) + #(mmap, v) = explain(model_extras, X) + #print(type(mmap)) + #assert isinstance(mmap, np.ndarray) + def test_baseline(self): ds = MockDataset(5) features = ["feature1", "feature2"] @@ -388,7 +396,6 @@ def test_losses(self): _test_model_basic(model, ds_iterator, loss=relative_mse(C=0.5)) - print(type(root_mean_squared_error(tf.random.uniform((50,)), tf.random.uniform((50,))).numpy())) assert isinstance( root_mean_squared_error(tf.random.uniform((50,)), tf.random.uniform((50,))).numpy(), np.float32 ) @@ -405,3 +412,6 @@ def test_losses(self): ).numpy(), np.float32, ) + + + diff --git a/tests/test_scalers.py b/tests/test_scalers.py index a152cd7a..1d5f0906 100644 --- a/tests/test_scalers.py +++ b/tests/test_scalers.py @@ -8,7 +8,7 @@ class TestImputers(): - def test_PandasRemoveInf(self): + def test_RobustMinMaxScaler(self): scaler = RobustMinMaxScaler(range=(-1, 1), clip=False, lower_quantile=0.1, upper_quantile=0.9) sk_scaler = RobustScaler(with_centering=False, with_scaling=True, quantile_range=(10, 90)) @@ -31,3 +31,4 @@ def test_PandasRemoveInf(self): sk_scaler.transform(df1) + diff --git a/tests/test_transformers.py b/tests/test_transformers.py index 7f8a7227..d6630b75 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -6,31 +6,26 @@ import scipy.stats from ceruleo.dataset.ts_dataset import AbstractTimeSeriesDataset +from ceruleo.iterators.iterators import RelativeToEnd, RelativeToStart from ceruleo.transformation.features.entropy import LocalEntropyMeasures from ceruleo.transformation.features.extraction import ( - EMD, - ChangesDetector, - Difference, - ExpandingStatistics, - OneHotCategorical, - RollingStatistics, - SimpleEncodingCategorical, - SlidingNonOverlappingEMD, -) + EMD, ChangesDetector, Difference, ExpandingStatistics, OneHotCategorical, + RollingStatistics, SimpleEncodingCategorical, SlidingNonOverlappingEMD) from ceruleo.transformation.features.outliers import ( - EWMAOutOfRange, - IQROutlierRemover, - IsolationForestOutlierRemover, - ZScoreOutlierRemover, -) -from ceruleo.transformation.features.resamplers import IntegerIndexResamplerTransformer -from ceruleo.transformation.features.selection import ( - ByNameFeatureSelector, - NullProportionSelector, -) -from ceruleo.transformation.features.transformation import Accumulate + EWMAOutOfRange, IQROutlierRemover, IsolationForestOutlierRemover, + ZScoreOutlierRemover) +from ceruleo.transformation.features.resamplers import \ + IntegerIndexResamplerTransformer +from ceruleo.transformation.features.selection import (ByNameFeatureSelector, + NullProportionSelector) +from ceruleo.transformation.features.slicing import SliceRows +from ceruleo.transformation.features.transformation import ( + Accumulate, Apply, Clip, Diff, ExpandingCentering, ExpandingNormalization, MeanCentering, + MedianCentering, Peaks, RollingCentering, Scale, Sqrt, Square, StringConcatenate, SubstractLinebase) from ceruleo.transformation.functional.pipeline.pipeline import Pipeline -from ceruleo.transformation.utils import QuantileEstimator +from ceruleo.transformation.utils import (IdentityTransformerStep, + PandasToNumpy, QuantileEstimator, + TransformerLambda) def manual_expanding(df: pd.DataFrame, min_points: int = 1): @@ -717,3 +712,130 @@ def test_resampler(self): resampler.partial_fit(A) q = resampler.transform(A) assert np.sum(q["B"] - np.array([0, 4, 8])) < 0.000005 + + +def test_utils(): + + input = pd.DataFrame({"a": [1, 2, 3]}) + out = PandasToNumpy().fit_transform(input) + assert isinstance(out, np.ndarray) + assert out.shape == (3, 1) + + out = TransformerLambda(callback=lambda x: x*2).fit_transform(input) + assert out["a"].tolist() == [2, 4, 6] + + out = IdentityTransformerStep().fit_transform(input) + assert out["a"].tolist() == [1, 2, 3] + + out = IdentityTransformerStep().fit_transform(input.values) + assert out.tolist() == [[1], [2], [3]] + + +def test_transformation(): + input = pd.DataFrame({"a": [5, 10, 15]}) + out = MeanCentering().fit_transform(input) + assert out["a"].tolist() == [-5, 0, 5] + + out = MedianCentering().fit_transform(input) + assert out["a"].tolist() == [-5, 0, 5] + + input1 = pd.DataFrame({"a": np.random.randn(200)*5 + 50}) + input2 = pd.DataFrame({"a": np.random.randn(200)*15 + 50}) + + + centering = (MedianCentering() + .partial_fit(input1) + .partial_fit(input2) + ) + + assert np.abs(centering.median["a"] - 50 ) < 1 + out = centering.transform(input1) + assert out.equals(input1 - centering.median) + + centering = (MedianCentering() + .partial_fit(pd.DataFrame({"a": np.random.randn(1)*5 + 50})) + ) + assert centering.median is None + + square = Square() + out = square.fit_transform(input) + assert out["a"].tolist() == [25, 100, 225] + + scaler = Scale(scale_factor=2) + out = scaler.fit_transform(input) + assert out["a"].tolist() == [10, 20, 30] + + + expanding_centering = ExpandingCentering() + out = expanding_centering.fit_transform(input1) + assert out.equals(input1 - input1.expanding().mean()) + + input = pd.DataFrame({"a": np.random.randn(200)*5 + 50}) + rolling_centering = RollingCentering(window=5, min_points=1) + out = rolling_centering.fit_transform(input) + assert out.equals(input - input.rolling(window=5, min_periods=1).mean()) + + input = pd.DataFrame({"a": np.random.randn(200)*5 + 50}) + sqrt = Sqrt() + out = sqrt.fit_transform(input) + assert out["a"].tolist() == np.sqrt(input["a"]).tolist() + + expanding_normalization = ExpandingNormalization() + out = expanding_normalization.fit_transform(input) + assert out.equals((input - input.expanding().mean()) / (input.expanding().std())) + + accumulate = Accumulate() + out = accumulate.fit_transform(input) + assert out.equals(input.cumsum()) + + accumulate = Accumulate(normalize=True) + out = accumulate.fit_transform(input) + cummulated = input.cumsum() + assert out.equals(cummulated / cummulated.abs().apply(np.sqrt)) + + peaks = Peaks(distance=3) + input = pd.DataFrame({"a": np.cos(np.arange(-np.pi, 3*np.pi, np.pi/4))}) + out = peaks.fit_transform(input) + assert out["a"].sum() == 2 + assert out["a"].iloc[4] == 1 + assert out["a"].iloc[12] == 1 + + clip = Clip(lower=0, upper=1) + input = pd.DataFrame({"a": np.linspace(-1, 2, 10)}) + out = clip.fit_transform(input) + assert out["a"].min() == 0 + assert out["a"].max() == 1 + + substract_linebase = SubstractLinebase() + input = pd.DataFrame({"a": np.linspace(5, 10, 10)}) + out = substract_linebase.fit_transform(input) + assert out["a"].min() == 0 + assert out["a"].max() == 5 + + diff = Diff() + input = pd.DataFrame({"a": np.linspace(5, 10, 10)}) + out = diff.fit_transform(input) + assert out.equals(input.diff()) + + apply = Apply(fun=lambda x: x**2) + input = pd.DataFrame({"a": np.linspace(5, 10, 10)}) + out = apply.fit_transform(input) + assert out.equals(input.apply(lambda x: x**2)) + + string_concat = StringConcatenate() + input = pd.DataFrame({"a": ["a", "b", "c"], + "b": ["d", "e", "f"]}) + out = string_concat.fit_transform(input) + assert out["concatenation"].tolist() == ["a-d", "b-e", "c-f"] + + + + +def test_slicing(): + slicer = SliceRows( + initial = RelativeToStart(5), + final = RelativeToEnd(5), + ) + input = pd.DataFrame({"a": np.linspace(5, 10, 10)}) + out = slicer.fit_transform(input) + assert out.equals(input.iloc[5:-5]) \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 7e8b439e..e9e22d7e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,7 @@ import numpy as np from ceruleo.iterators.iterators import windowed_signal_generator +from ceruleo.utils.download import download class TestWindowGeneratorTest: @@ -28,3 +29,6 @@ def test_signal_generator(self): assert X_w[-1, 0] == X[2, 0] assert (np.squeeze(y[[3, 4]]) == np.squeeze(y_w)).all() + +def test_download(): + download("http://www.google.com", "test.html")