From 0ffdecaa6a145c0781ec7b40c104248a0f188f2c Mon Sep 17 00:00:00 2001
From: Luciano Lorenti
Date: Mon, 13 May 2024 17:53:05 +0200
Subject: [PATCH] Iterators: Allow sample weights to use dataframes, series and
 arrays

---
 ceruleo/iterators/sample_weight.py |  38 +++++++---
 tests/test_iterators.py            | 109 ++++++++++++++++++++---
 tests/test_sklearn.py              |  50 ++++++++++---
 3 files changed, 145 insertions(+), 52 deletions(-)

diff --git a/ceruleo/iterators/sample_weight.py b/ceruleo/iterators/sample_weight.py
index ac36d997..da1fa86c 100644
--- a/ceruleo/iterators/sample_weight.py
+++ b/ceruleo/iterators/sample_weight.py
@@ -1,9 +1,8 @@
-from abc import abstractmethod
-from signal import signal
 
 from typing import Any, Callable, Union
 
 import numpy as np
+import pandas as pd
 
 
 class AbstractSampleWeights:
@@ -11,10 +10,26 @@ class AbstractSampleWeights:
     The base class for the sample weight provider
     """
 
-    def __call__(self, y, i: int, metadata):
+    def __call__(self, y: Union[np.ndarray, pd.DataFrame], i: int, metadata):
         raise NotImplementedError
 
 
+
+def get_value(y: Union[np.ndarray, pd.DataFrame], i:int) -> float:
+    if isinstance(y, np.ndarray):
+        if len(y.shape) > 1:
+            return y[i, 0]
+        else:
+            return y[i]
+    elif isinstance(y, pd.DataFrame):
+        return y.iloc[i, 0]
+    elif isinstance(y, pd.Series):
+        return y.iloc[i]
+    else:
+        raise ValueError(f"Unsupported type {type(y)}")
+
+
+
 class NotWeighted(AbstractSampleWeights):
     """
     Simplest sample weight provider
@@ -22,7 +37,7 @@ class NotWeighted(AbstractSampleWeights):
     Provide 1 as a sample weight for every sample
     """
 
-    def __call__(self, y, i: int, metadata):
+    def __call__(self, y: Union[np.ndarray, pd.DataFrame], i: int, metadata):
         return 1
 
 
@@ -41,8 +56,10 @@ class RULInverseWeighted(AbstractSampleWeights):
     Weight each sample by the inverse of the RUL
     """
 
-    def __call__(self, y, i: int, metadata):
-        return 1 / (y[i, 0] + 1)
+    def __call__(self, y : Union[np.ndarray, pd.DataFrame], i: int, metadata):
+        return 1 / (get_value(y, i) + 1)
+
+
 
 
 class InverseToLengthWeighted(AbstractSampleWeights):
@@ -53,8 +70,8 @@ class InverseToLengthWeighted(AbstractSampleWeights):
     """
 
-    def __call__(self, y, i: int, metadata):
-        return 1 / y[0]
+    def __call__(self, y:Union[np.ndarray, pd.DataFrame], i: int, metadata):
+        return 1 / get_value(y, 0)
 
 
 class ExponentialDecay(AbstractSampleWeights):
@@ -64,8 +81,9 @@ class ExponentialDecay(AbstractSampleWeights):
     """
 
     def __init__(self, *, near_0_at: float):
+        super().__init__()
         self.alpha = -((near_0_at) ** 2) / np.log(0.000001)
 
-    def __call__(self, y, i: int, metadata):
-        return (1 + np.exp(-(y[i, 0] ** 2) / self.alpha)) ** 2
+    def __call__(self, y:Union[np.ndarray, pd.DataFrame], i: int, metadata):
+        return ( np.exp(-(get_value(y,i) ** 2) / self.alpha))
 
diff --git a/tests/test_iterators.py b/tests/test_iterators.py
index 49357c55..6d3de450 100644
--- a/tests/test_iterators.py
+++ b/tests/test_iterators.py
@@ -3,27 +3,31 @@
 from ceruleo.dataset.ts_dataset import AbstractPDMDataset
 from ceruleo.iterators.batcher import Batcher
 from ceruleo.iterators.iterators import WindowedDatasetIterator
-from ceruleo.transformation import Pipeline, Transformer
+from ceruleo.iterators.sample_weight import (
+    ExponentialDecay,
+    InverseToLengthWeighted,
+    NotWeighted,
+    RULInverseWeighted,
+)
+from ceruleo.transformation import Transformer
 from ceruleo.transformation.features.scalers import MinMaxScaler
 from ceruleo.transformation.features.selection import ByNameFeatureSelector
 
 
 class SimpleDataset(AbstractPDMDataset):
     def __init__(self):
-
         self.lives = [
-            pd.DataFrame({
-                'feature1': np.array(range(0, 100)),
-                'RUL': np.array(range(0, 100))
-            })]
-
+            pd.DataFrame(
+                {"feature1": np.array(range(0, 100)), "RUL": np.array(range(0, 100))}
+            )
+        ]
 
     def get_time_series(self, i: int):
         return self.lives[i]
 
     @property
     def rul_column(self):
-        return 'RUL'
+        return "RUL"
 
     @property
     def n_time_series(self):
@@ -32,22 +36,26 @@ def n_time_series(self):
 
 
 class MockDataset(AbstractPDMDataset):
     def __init__(self, nlives: int):
-
         self.lives = [
-            pd.DataFrame({
-                'feature1': np.linspace(0, (i+1)*100, 50),
-                'feature2': np.linspace(-25, (i+1)*500, 50),
-                'RUL': np.linspace(100, 0, 50)
-            })
-            for i in range(nlives-1)]
+            pd.DataFrame(
+                {
+                    "feature1": np.linspace(0, (i + 1) * 100, 50),
+                    "feature2": np.linspace(-25, (i + 1) * 500, 50),
+                    "RUL": np.linspace(100, 0, 50),
+                }
+            )
+            for i in range(nlives - 1)
+        ]
         self.lives.append(
-            pd.DataFrame({
-                'feature1': np.linspace(0, 5*100, 50),
-                'feature2': np.linspace(-25, 5*500, 50),
-                'feature3': np.linspace(-25, 5*500, 50),
-                'RUL': np.linspace(100, 0, 50)
-            })
+            pd.DataFrame(
+                {
+                    "feature1": np.linspace(0, 5 * 100, 50),
+                    "feature2": np.linspace(-25, 5 * 500, 50),
+                    "feature3": np.linspace(-25, 5 * 500, 50),
+                    "RUL": np.linspace(100, 0, 50),
+                }
+            )
         )
 
     def get_time_series(self, i: int):
@@ -55,25 +63,25 @@ def get_time_series(self, i: int):
 
     @property
     def rul_column(self):
-        return 'RUL'
+        return "RUL"
 
     @property
     def n_time_series(self):
         return len(self.lives)
 
 
-class TestIterators():
+class TestIterators:
     def test_iterators(self):
-        features = ['feature1', 'feature2']
+        features = ["feature1", "feature2"]
         x = ByNameFeatureSelector(features=features)
         x = MinMaxScaler(range=(-1, 1))(x)
-        y = ByNameFeatureSelector(features=['RUL'])
+        y = ByNameFeatureSelector(features=["RUL"])
         transformer = Transformer(x, y)
         batch_size = 15
         window_size = 5
         ds = MockDataset(5)
-        
+
         transformer.fit(ds)
         b = Batcher.new(ds.map(transformer), window_size, batch_size, 1)
         X, y, w = next(b)
 
@@ -84,14 +92,47 @@
 
     def test_2(self):
         dataset = SimpleDataset()
-        pipe = ByNameFeatureSelector(features=['feature1'])
-        y_pipe = ByNameFeatureSelector(features=['RUL'])
-        transformer_raw = Transformer(
-            pipelineX=pipe,
-            pipelineY=y_pipe
-        )
+        pipe = ByNameFeatureSelector(features=["feature1"])
+        y_pipe = ByNameFeatureSelector(features=["RUL"])
+        transformer_raw = Transformer(pipelineX=pipe, pipelineY=y_pipe)
         transformer_raw.fit(dataset)
-        it = WindowedDatasetIterator(dataset.map(transformer_raw), 5) 
+        it = WindowedDatasetIterator(dataset.map(transformer_raw), 5)
         X, y, sw = next(it)
-        assert np.all(X == np.array([[0,1,2,3,4]]).T)
+        assert np.all(X == np.array([[0, 1, 2, 3, 4]]).T)
         assert y[0][0] == 4
+
+
+def build_elements():
+    a = np.linspace(10, 0, 11)
+    b = np.vstack((a, a)).T
+    c = pd.DataFrame(a, columns=["RUL"])
+    d = pd.DataFrame(b, columns=["RUL", "RUL2"])
+    e = pd.Series(a)
+    return [a, b, c, d, e]
+
+
+class TestSampleWeight:
+    def test_not_weighted(self):
+        nw = NotWeighted()
+        for el in build_elements():
+            assert nw(el, 0, None) == 1
+            assert nw(el, 5, None) == 1
+
+    def test_rul_inverse_weighted(self):
+        inverse = RULInverseWeighted()
+        for el in build_elements():
+            assert inverse(el, 0, None) == 1.0/(10 + 1)
+            assert inverse(el, 5, None) == 1.0/(5 + 1)
+            assert inverse(el, 10, None) == 1.0
+
+    def test_InverseToLengthWeighted(self):
+        inverse = InverseToLengthWeighted()
+        for el in build_elements():
+            assert inverse(el, 0, None) == 1/(10.0)
+            assert inverse(el, 5, None) == 1/(10.0)
+
+    def test_ExponentialDecay(self):
+        exp = ExponentialDecay(near_0_at=3)
+        for el in build_elements():
+            assert exp(el, 0, None) < 0.00001
+            assert exp(el, 9, None) > 0.21
\ No newline at end of file
diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py
index e7533832..b2de39c1 100644
--- a/tests/test_sklearn.py
+++ b/tests/test_sklearn.py
@@ -1,16 +1,19 @@
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import GridSearchCV
+
 from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset, sensor_indices
-from ceruleo.transformation import Transformer
-from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline
-from ceruleo.transformation.features.selection import ByNameFeatureSelector
-from ceruleo.transformation.features.scalers import MinMaxScaler
+from ceruleo.iterators.sample_weight import RULInverseWeighted
 from ceruleo.models.sklearn import (
+    CeruleoMetricWrapper,
     CeruleoRegressor,
     TimeSeriesWindowTransformer,
-    CeruleoMetricWrapper
 )
-from sklearn.linear_model import Ridge
-from sklearn.model_selection import GridSearchCV
-from sklearn.ensemble import RandomForestRegressor
+from ceruleo.transformation import Transformer
+from ceruleo.transformation.features.scalers import MinMaxScaler
+from ceruleo.transformation.features.selection import ByNameFeatureSelector
+from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline
+
 
 def test_gridsearch_cv():
     train_dataset = CMAPSSDataset(train=True, models='FD001')
@@ -48,3 +51,34 @@ def test_gridsearch_cv():
     grid_search.fit(train_dataset)
 
     assert grid_search is not None
+
+
+
+def test_sample_weights():
+    train_dataset = CMAPSSDataset(train=True, models='FD001')
+    FEATURES = [train_dataset[0].columns[i] for i in sensor_indices]
+    transformer = Transformer(
+        pipelineX=make_pipeline(
+            ByNameFeatureSelector(features=FEATURES),
+            MinMaxScaler(range=(-1, 1))
+
+        ),
+        pipelineY=make_pipeline(
+            ByNameFeatureSelector(features=['RUL']),
+        )
+    )
+
+
+    regressor_gs = CeruleoRegressor(
+        TimeSeriesWindowTransformer(
+            transformer,
+            window_size=32,
+            sample_weight=RULInverseWeighted(),
+            padding=True,
+            step=1),
+        Ridge(alpha=15))
+
+
+
+    regressor_gs = regressor_gs.fit(train_dataset)
+    assert regressor_gs is not None