-
Notifications
You must be signed in to change notification settings - Fork 34
/
split.py
22 lines (16 loc) · 865 Bytes
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""
This module defines the following routines used by the 'split' step of the regression recipe:
- ``create_dataset_filter``: Defines customizable logic for filtering the training
datasets produced by the data splitting procedure. Note that arbitrary transformations
should go into the transform step.
"""
from pandas import DataFrame, Series
def create_dataset_filter(dataset: DataFrame) -> Series:
    """
    Mark rows of the split datasets to be additionally filtered. This function will be called on
    the training datasets.

    The default implementation flags every row with ``True``; replace the body to flag
    individual rows (for example, as part of data cleaning).

    NOTE(review): whether ``True`` means "keep" or "drop" is decided by the caller, which is
    not visible in this file -- confirm against the split step before customizing.

    :param dataset: The {train,validation,test} dataset produced by the data splitting procedure.
    :return: A boolean Series, aligned with ``dataset.index``, indicating whether each row
             should be filtered.
    """
    # FIXME::OPTIONAL: implement post-split filtering on the dataframes, such as data cleaning.
    # The return annotation names the ``Series`` class itself; a call such as ``Series(bool)``
    # would construct a throwaway Series at definition time rather than describe a type.
    return Series(True, index=dataset.index)