Skip to content

Commit

Permalink
Iterators: Allow sample weights to use dataframes, series and arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
lucianolorenti committed May 13, 2024
1 parent 4cc7edb commit 0ffdeca
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 52 deletions.
38 changes: 28 additions & 10 deletions ceruleo/iterators/sample_weight.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
from abc import abstractmethod

from signal import signal
from typing import Any, Callable, Union

import numpy as np
import pandas as pd


class AbstractSampleWeights:
    """
    Base class for sample weight providers.

    Subclasses implement ``__call__`` to return a scalar weight for
    sample ``i`` of the target container ``y``.
    """

    def __call__(self, y: Union[np.ndarray, pd.DataFrame, pd.Series], i: int, metadata):
        """Return the weight of sample ``i``.

        Parameters:
            y: Target values (1D/2D numpy array, DataFrame or Series).
            i: Positional index of the sample.
            metadata: Implementation-defined extra information.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError



def get_value(y: Union[np.ndarray, pd.DataFrame, pd.Series], i: int) -> float:
    """
    Extract the scalar target value at position ``i`` from ``y``.

    For 2D containers (arrays with more than one dimension, DataFrames)
    the value is taken from the first column.

    Parameters:
        y: Target container: numpy array (1D or 2D), DataFrame or Series.
        i: Positional index of the sample.

    Returns:
        The scalar target value at position ``i``.

    Raises:
        ValueError: If ``y`` is not one of the supported types.
    """
    if isinstance(y, np.ndarray):
        # 2D arrays carry the target in the first column.
        return y[i, 0] if y.ndim > 1 else y[i]
    if isinstance(y, pd.DataFrame):
        return y.iloc[i, 0]
    if isinstance(y, pd.Series):
        return y.iloc[i]
    raise ValueError(f"Unsupported type {type(y)}")



class NotWeighted(AbstractSampleWeights):
    """
    Simplest sample weight provider.

    Provides 1 as the sample weight for every sample.
    """

    def __call__(self, y: Union[np.ndarray, pd.DataFrame, pd.Series], i: int, metadata):
        """Return a constant weight of 1, regardless of ``y``, ``i`` or ``metadata``."""
        return 1


Expand All @@ -41,8 +56,10 @@ class RULInverseWeighted(AbstractSampleWeights):
Weight each sample by the inverse of the RUL
"""

def __call__(self, y: Union[np.ndarray, pd.DataFrame, pd.Series], i: int, metadata):
    """Weight sample ``i`` by the inverse of its RUL: ``1 / (RUL_i + 1)``.

    Uses ``get_value`` so arrays, DataFrames and Series are all supported;
    the ``+ 1`` avoids division by zero at end of life (RUL == 0).
    """
    return 1 / (get_value(y, i) + 1)




class InverseToLengthWeighted(AbstractSampleWeights):
Expand All @@ -53,8 +70,8 @@ class InverseToLengthWeighted(AbstractSampleWeights):
"""

def __call__(self, y: Union[np.ndarray, pd.DataFrame, pd.Series], i: int, metadata):
    """Weight every sample by the inverse of the first RUL value.

    The first RUL value acts as a proxy for the life length, so every
    sample of the same life receives the same constant weight.
    ``i`` is ignored; ``get_value`` handles arrays, DataFrames and Series.
    """
    return 1 / get_value(y, 0)


class ExponentialDecay(AbstractSampleWeights):
    """
    Sample weights that decay exponentially with the RUL.

    Samples near the failure (RUL close to 0) get weights near 1; the
    weight approaches 0 as the RUL approaches ``near_0_at``.

    Parameters:
        near_0_at: RUL value at which the weight is approximately 0 (1e-6).
    """

    def __init__(self, *, near_0_at: float):
        super().__init__()
        # Solve exp(-near_0_at**2 / alpha) == 1e-6 for alpha
        # (np.log(1e-6) is negative, so alpha is positive).
        self.alpha = -((near_0_at) ** 2) / np.log(0.000001)

    def __call__(self, y: Union[np.ndarray, pd.DataFrame, pd.Series], i: int, metadata):
        """Return ``exp(-RUL_i**2 / alpha)`` for sample ``i``."""
        return np.exp(-(get_value(y, i) ** 2) / self.alpha)
109 changes: 75 additions & 34 deletions tests/test_iterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,31 @@
from ceruleo.dataset.ts_dataset import AbstractPDMDataset
from ceruleo.iterators.batcher import Batcher
from ceruleo.iterators.iterators import WindowedDatasetIterator
from ceruleo.transformation import Pipeline, Transformer
from ceruleo.iterators.sample_weight import (
ExponentialDecay,
InverseToLengthWeighted,
NotWeighted,
RULInverseWeighted,
)
from ceruleo.transformation import Transformer
from ceruleo.transformation.features.scalers import MinMaxScaler
from ceruleo.transformation.features.selection import ByNameFeatureSelector


class SimpleDataset(AbstractPDMDataset):
def __init__(self):
    """Create a single synthetic run-to-failure life.

    The life has 100 rows where ``feature1`` equals the RUL, which makes
    expected windows trivial to compute in the tests.
    """
    life = pd.DataFrame(
        {"feature1": np.array(range(0, 100)), "RUL": np.array(range(0, 100))}
    )
    self.lives = [life]

def get_time_series(self, i: int):
    """Return the ``i``-th life stored in the dataset."""
    return self.lives[i]

@property
def rul_column(self):
    """Name of the column holding the remaining useful life."""
    return "RUL"

@property
def n_time_series(self):
Expand All @@ -32,48 +36,52 @@ def n_time_series(self):

class MockDataset(AbstractPDMDataset):
    """In-memory dataset with ``nlives`` synthetic run-to-failure lives.

    All lives share a linear RUL ramp from 100 to 0 over 50 rows; the
    last life carries an extra ``feature3`` column so that feature
    selection across heterogeneous lives can be exercised.
    """

    def __init__(self, nlives: int):
        # First nlives - 1 lives: two features whose slopes depend on i.
        self.lives = [
            pd.DataFrame(
                {
                    "feature1": np.linspace(0, (i + 1) * 100, 50),
                    "feature2": np.linspace(-25, (i + 1) * 500, 50),
                    "RUL": np.linspace(100, 0, 50),
                }
            )
            for i in range(nlives - 1)
        ]

        # Final life: same shape plus an additional feature3 column.
        extra_life = pd.DataFrame(
            {
                "feature1": np.linspace(0, 5 * 100, 50),
                "feature2": np.linspace(-25, 5 * 500, 50),
                "feature3": np.linspace(-25, 5 * 500, 50),
                "RUL": np.linspace(100, 0, 50),
            }
        )
        self.lives.append(extra_life)

    def get_time_series(self, i: int):
        """Return the ``i``-th life of the dataset."""
        return self.lives[i]

    @property
    def rul_column(self):
        """Name of the column holding the remaining useful life."""
        return "RUL"

    @property
    def n_time_series(self):
        """Number of lives stored in the dataset."""
        return len(self.lives)


class TestIterators():
class TestIterators:
def test_iterators(self):
features = ['feature1', 'feature2']
features = ["feature1", "feature2"]
x = ByNameFeatureSelector(features=features)
x = MinMaxScaler(range=(-1, 1))(x)

y = ByNameFeatureSelector(features=['RUL'])
y = ByNameFeatureSelector(features=["RUL"])
transformer = Transformer(x, y)
batch_size = 15
window_size = 5
ds = MockDataset(5)

transformer.fit(ds)
b = Batcher.new(ds.map(transformer), window_size, batch_size, 1)
X, y, w = next(b)
Expand All @@ -84,14 +92,47 @@ def test_iterators(self):

def test_2(self):
    """Windowed iteration over a single-life dataset yields the expected first window."""
    dataset = SimpleDataset()
    x_pipeline = ByNameFeatureSelector(features=["feature1"])
    y_pipeline = ByNameFeatureSelector(features=["RUL"])
    transformer_raw = Transformer(pipelineX=x_pipeline, pipelineY=y_pipeline)
    transformer_raw.fit(dataset)

    iterator = WindowedDatasetIterator(dataset.map(transformer_raw), 5)
    X, y, sw = next(iterator)

    # feature1 equals the row index, so the first 5-wide window is 0..4
    # and the target is the RUL at the window's end.
    assert np.all(X == np.array([[0, 1, 2, 3, 4]]).T)
    assert y[0][0] == 4


def build_elements():
    """Build the same 11-step RUL ramp (10 down to 0) in every supported container type.

    Returns a list with: a 1D array, a 2-column 2D array, a one-column
    DataFrame, a two-column DataFrame and a Series — so each sample-weight
    provider can be checked against all input kinds it must accept.
    """
    ramp = np.linspace(10, 0, 11)
    two_columns = np.vstack((ramp, ramp)).T
    one_col_frame = pd.DataFrame(ramp, columns=["RUL"])
    two_col_frame = pd.DataFrame(two_columns, columns=["RUL", "RUL2"])
    ramp_series = pd.Series(ramp)
    return [ramp, two_columns, one_col_frame, two_col_frame, ramp_series]


class TestSampleWeight:
    """Check every sample-weight provider against all supported container types."""

    def test_not_weighted(self):
        # NotWeighted always yields 1, regardless of container or index.
        provider = NotWeighted()
        for element in build_elements():
            assert provider(element, 0, None) == 1
            assert provider(element, 5, None) == 1

    def test_rul_inverse_weighted(self):
        # Weight is 1 / (RUL + 1); the ramp goes 10 -> 0 over 11 samples.
        provider = RULInverseWeighted()
        for element in build_elements():
            assert provider(element, 0, None) == 1.0 / (10 + 1)
            assert provider(element, 5, None) == 1.0 / (5 + 1)
            assert provider(element, 10, None) == 1.0

    def test_InverseToLengthWeighted(self):
        # Weight is constant: the inverse of the first RUL value.
        provider = InverseToLengthWeighted()
        for element in build_elements():
            assert provider(element, 0, None) == 1 / (10.0)
            assert provider(element, 5, None) == 1 / (10.0)

    def test_ExponentialDecay(self):
        # Near the start (RUL 10 > near_0_at) the weight is ~0; near the
        # end of life (RUL 1) the weight is substantial.
        provider = ExponentialDecay(near_0_at=3)
        for element in build_elements():
            assert provider(element, 0, None) < 0.00001
            assert provider(element, 9, None) > 0.21
50 changes: 42 additions & 8 deletions tests/test_sklearn.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset, sensor_indices
from ceruleo.transformation import Transformer
from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline
from ceruleo.transformation.features.selection import ByNameFeatureSelector
from ceruleo.transformation.features.scalers import MinMaxScaler
from ceruleo.iterators.sample_weight import RULInverseWeighted
from ceruleo.models.sklearn import (
CeruleoMetricWrapper,
CeruleoRegressor,
TimeSeriesWindowTransformer,
CeruleoMetricWrapper
)
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from ceruleo.transformation import Transformer
from ceruleo.transformation.features.scalers import MinMaxScaler
from ceruleo.transformation.features.selection import ByNameFeatureSelector
from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline


def test_gridsearch_cv():
train_dataset = CMAPSSDataset(train=True, models='FD001')
Expand Down Expand Up @@ -48,3 +51,34 @@ def test_gridsearch_cv():

grid_search.fit(train_dataset)
assert grid_search is not None



def test_sample_weights():
    """End-to-end check: RULInverseWeighted weights flow through the sklearn wrapper."""
    train_dataset = CMAPSSDataset(train=True, models='FD001')
    selected_features = [train_dataset[0].columns[i] for i in sensor_indices]

    transformer = Transformer(
        pipelineX=make_pipeline(
            ByNameFeatureSelector(features=selected_features),
            MinMaxScaler(range=(-1, 1)),
        ),
        pipelineY=make_pipeline(
            ByNameFeatureSelector(features=['RUL']),
        ),
    )

    regressor = CeruleoRegressor(
        TimeSeriesWindowTransformer(
            transformer,
            window_size=32,
            sample_weight=RULInverseWeighted(),
            padding=True,
            step=1,
        ),
        Ridge(alpha=15),
    )

    regressor = regressor.fit(train_dataset)
    assert regressor is not None

0 comments on commit 0ffdeca

Please sign in to comment.