diff --git a/docs/sdk/examples/synthesize_tabular_data.md b/docs/sdk/examples/synthesize_tabular_data.md
index 7e6b15dd..dd49099a 100644
--- a/docs/sdk/examples/synthesize_tabular_data.md
+++ b/docs/sdk/examples/synthesize_tabular_data.md
@@ -2,6 +2,8 @@
 
 **Use YData's *RegularSynthesizer* to generate tabular synthetic data**
 
+For a more detailed tutorial, please check the [YData Fabric Academy ydata-sdk notebooks](https://github.com/ydataai/academy/tree/master).
+
 ```python
 --8<-- "examples/synthesizer_from_pandas.py"
 ```
diff --git a/docs/sdk/examples/synthesize_timeseries_data.md b/docs/sdk/examples/synthesize_timeseries_data.md
index a224a530..55f37419 100644
--- a/docs/sdk/examples/synthesize_timeseries_data.md
+++ b/docs/sdk/examples/synthesize_timeseries_data.md
@@ -14,6 +14,8 @@ Dissecting any time-series dataset, we see differences in variables' behavior th
 - Variables that refer to entities (single or multiple entities)
 - Variables that are attributes (those that don't depend on time but rather on the entity)
 
+For a more detailed tutorial, please check the [YData Fabric Academy ydata-sdk notebooks](https://github.com/ydataai/academy/tree/master).
+
 Below find an example:
 
 ```python
diff --git a/examples/synthesizers/time_series_quickstart.py b/examples/synthesizers/time_series_quickstart.py
index 9b1da0d1..4c41444b 100644
--- a/examples/synthesizers/time_series_quickstart.py
+++ b/examples/synthesizers/time_series_quickstart.py
@@ -1,21 +1,52 @@
+# -*- coding: utf-8 -*-
+
+# Authentication
 import os
 
 from ydata.sdk.dataset import get_dataset
 from ydata.sdk.synthesizers import TimeSeriesSynthesizer
 
 # Do not forget to add your token as env variable
-os.environ["YDATA_TOKEN"] = ''
+os.environ["YDATA_TOKEN"] = '{insert-token}'
+
+
+# Sampling an example dataset for a multi-entity & multivariate time-series dataset
+
+# Generate the dataset
+time_series_data = get_dataset('timeseries')
 
-X = get_dataset('occupancy')
+# Print the first few rows of the dataset
+print(time_series_data.head())
+
+# Train a synthetic data generator
+
+# From a pandas DataFrame
 
 # We initialize a time series synthesizer
 # As long as the synthesizer does not call `fit`, it exists only locally
-synth = TimeSeriesSynthesizer()
+synth = TimeSeriesSynthesizer(name='Time-series synth')
 
 # We train the synthesizer on our dataset
 # sortbykey -> variable that define the time order for the sequence
-synth.fit(X, sortbykey='date')
+synth.fit(time_series_data, sortbykey='time', entities='entity_id')
+
+# Generate samples from an already trained synthesizer
+# From the synthesizer in the current context
+
+
+# Generate a sample with a given number of entities
+# In this example, the objective is to generate a dataset of the same size as the original, so 5 entities are generated.
+sample = synth.sample(n_entities=5)
+
+sample.head()
+
+# From a previously trained synthetic data generation model
+# List the trained synthetic data generators to get the synthesizer uid
+TimeSeriesSynthesizer.list()
+
+synth = TimeSeriesSynthesizer(uid='{insert-synth-id}').get()
+
+# Generate a new synthetic dataset with the sample method
+sample = synth.sample(n_entities=5)
 
-# By default it is requested a synthetic sample with the same length as the original data
-# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected
-sample = synth.sample(n_entities=1)
+sample.head()
diff --git a/src/ydata/sdk/dataset/dataset.py b/src/ydata/sdk/dataset/dataset.py
index fe2b465c..1a573976 100644
--- a/src/ydata/sdk/dataset/dataset.py
+++ b/src/ydata/sdk/dataset/dataset.py
@@ -1,3 +1,4 @@
+import numpy as np
 from numpy import int64
 from pandas import DataFrame as pdDataFrame
 from pandas import read_csv, to_datetime
@@ -5,6 +6,50 @@
 from ydata.sdk.utils.cache import cache_file
 
 
+def get_timeseries() -> pdDataFrame:
+    def generate_multivariate_multientity_timeseries(num_rows=1000, num_entities=5, num_timesteps=10):
+        """Generates a multivariate, multi-entity time-series dataset.
+
+        Args:
+            num_rows: The number of rows in the dataset. Defaults to 1000.
+            num_entities: The number of entities in the dataset. Defaults to 5.
+            num_timesteps: The number of time steps for each entity. Defaults to 10.
+
+        Returns:
+            A pandas DataFrame representing the time-series dataset.
+        """
+
+        data = []
+        for entity in range(num_entities):
+            for t in range(num_timesteps):
+                row = {
+                    'entity_id': entity,
+                    'time': t
+                }
+                for feature in range(3):
+                    # Simulate some random data
+                    row[f'feature_{feature}'] = np.random.rand()
+                data.append(row)
+
+        # Adding more rows to meet the desired number of rows
+        additional_rows = max(0, num_rows - len(data))
+        for _ in range(additional_rows):
+            entity = np.random.randint(0, num_entities)
+            t = np.random.randint(0, num_timesteps)
+            row = {
+                'entity_id': entity,
+                'time': t
+            }
+            for feature in range(3):
+                row[f'feature_{feature}'] = np.random.rand()
+            data.append(row)
+        df = pdDataFrame(data)
+
+        return df
+
+    return generate_multivariate_multientity_timeseries()
+
+
 def get_census() -> pdDataFrame:
     file_name = cache_file(
         "census_train.csv",
@@ -75,7 +120,7 @@ def get_dataset(name: str):
         'census': get_census,
         'titanic': get_titanic,
         'airquality': get_airquality,
-        'occupancy': get_occupancy
+        'timeseries': get_timeseries
     }
 
     if name not in DATASETS:
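
The snippet below is a minimal, illustrative sanity check of the new `timeseries` demo dataset, assuming the patch above is applied as shown. The column names (`entity_id`, `time`, `feature_0`..`feature_2`), the five entities, and the 1000-row default are taken directly from `get_timeseries()` in this diff; nothing beyond the patched `ydata.sdk.dataset.get_dataset` is assumed.

```python
# Sanity-check the 'timeseries' dataset introduced by this patch.
# Defaults in get_timeseries(): num_rows=1000, num_entities=5, num_timesteps=10.
from ydata.sdk.dataset import get_dataset

df = get_dataset('timeseries')

# 5 entities x 10 timesteps are generated first, then random rows are appended
# until the 1000-row default is reached.
assert len(df) == 1000
assert set(df.columns) == {'entity_id', 'time', 'feature_0', 'feature_1', 'feature_2'}
assert df['entity_id'].nunique() == 5

# Rows are not emitted in time order, so inspect them sorted per entity.
print(df.sort_values(['entity_id', 'time']).head())
```

One observation on the generator: the filler loop draws random `(entity_id, time)` pairs, so the demo frame is unordered and contains repeated timestamps per entity; the quickstart's `fit` call passes `sortbykey='time'` and `entities='entity_id'` so the synthesizer can establish the temporal order itself.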