# train.py
# %% [markdown]
# # XGBoost
# * TODO: add averages for cells and stations; NB the inference code will need to be adapted too
# * TODO: target manipulations/engineering
#   * rolling autocorrelation
# * TODO: (vector-leaf) multi-output regression (see the sketch below)
# * TODO: for weeks 10-11 we may need a radically different approach that does not rely on lag features, since these will be heavily impacted by compounding errors
#   * e.g. train a linear regression for the trend and XGBoost for the seasonality, using only the index features
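# %% [markdown]
# A minimal sketch of the multi-output TODO above, assuming xgboost >= 2.0 (which added
# `multi_strategy="multi_output_tree"`, i.e. vector leaves). The synthetic data is for
# illustration only; this cell is not part of the training pipeline.
# %%
import numpy as np
import xgboost as xgb

_rng_mo = np.random.default_rng(0)
X_mo = _rng_mo.normal(size=(300, 8))  # 300 samples, 8 features
Y_mo = _rng_mo.normal(size=(300, 4))  # 4 targets fitted jointly by a single model

multi_output_sketch = xgb.XGBRegressor(
    tree_method="hist",
    multi_strategy="multi_output_tree",  # each tree predicts a vector over all targets
    n_estimators=20,
)
multi_output_sketch.fit(X_mo, Y_mo)
print(multi_output_sketch.predict(X_mo).shape)  # (300, 4)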
# %% [markdown]
# ## Roadmap Note
# The plan to first fit a linear model and then apply XGBoost to its residuals is a solid approach known as model stacking or residual modeling. It can be exposed as a W&B config parameter for flexibility. When implementing it, consider:
#
# * Pipeline: use scikit-learn's Pipeline (or a small wrapper) to chain the linear model and XGBoost.
# * Parameterization: add a config flag (e.g., use_linear_model) to toggle the behavior.
# * Logging: track both models' performance in W&B, separately and combined.
#
# A sketch of the residual-modeling idea follows below.
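# %% [markdown]
# A minimal, self-contained sketch of the residual-modeling idea above: a linear model captures
# the trend and XGBoost is then fitted on its residuals. The `use_linear_model` toggle and the
# synthetic data are placeholders; this cell is not wired into the training loop.
# %%
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression

use_linear_model = True  # in the real script this flag would come from wandb.config

_rng_res = np.random.default_rng(1)
X_res = _rng_res.normal(size=(400, 5))
y_res = X_res @ _rng_res.normal(size=5) + _rng_res.normal(scale=0.1, size=400)

if use_linear_model:
    linear_part = LinearRegression().fit(X_res, y_res)
    residuals = y_res - linear_part.predict(X_res)  # XGBoost only has to explain what the linear fit misses
    residual_booster = xgb.XGBRegressor(n_estimators=50).fit(X_res, residuals)
    y_res_hat = linear_part.predict(X_res) + residual_booster.predict(X_res)
else:
    y_res_hat = xgb.XGBRegressor(n_estimators=50).fit(X_res, y_res).predict(X_res)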
# %%
from tqdm.notebook import tqdm
from wandb.integration.xgboost import WandbCallback
import wandb
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from optuna_integration.xgboost import XGBoostPruningCallback
from optuna_integration.wandb import WeightsAndBiasesCallback
import optuna
import xgboost as xgb
import sklearn
import sklearn.linear_model  # makes sklearn.linear_model.LinearRegression available below
import numpy as np
import polars as pl
import pandas as pd
import pickle
from typing import Callable
import math
from pathlib import Path
import os
import yaml
import utils
# Manually set the notebook name
# os.environ["WANDB_NOTEBOOK_NAME"] = "xgboost_train.ipynb"
# %%
DEBUG = False
# config_file_path = Path('configs') / 'linear_config.yaml'
# config_file_path = Path('configs') / 'autoregressive_config.yaml'
# config_file_path = Path('configs') / '168hour_shift_config.yaml'
config_file_path = Path('configs') / 'SWEEP_autoregressive_config.yaml'
# Load the YAML configuration file
with open(config_file_path, 'r') as file:
config = yaml.safe_load(file)
class dotdict(dict):
"""dot.notation access to dictionary attributes"""
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
config = dotdict(config)
data_dir = Path('input-data')
target_dataframes = {
# This is the target variable
'thp_vol': pl.read_csv(data_dir / 'traffic_DLThpVol.csv'),
'prb': pl.read_csv(data_dir / 'traffic_DLPRB.csv'),
'thp_time': pl.read_csv(data_dir / 'traffic_DLThpTime.csv'),
'mr_number': pl.read_csv(data_dir / 'traffic_MR_number.csv')
}
# Filter target dataframes based on config
target_dataframes = {
k: v for k, v in target_dataframes.items() if k in config.target_df_names}
# The unnamed first CSV column holds the hour index
idx_hour_series = target_dataframes['thp_vol']['']
# Drop the first column (idx hour) from each dataframe
for k in target_dataframes:
target_dataframes[k] = target_dataframes[k].drop('')
# Debug mode: work on a small subset of rows and columns
if DEBUG:
target_dataframes = {k: v.head(200).select(
v.columns[:800]) for k, v in target_dataframes.items()}
# Merge xgb_hyperparams into config
# config.update(xgb_hyperparams)
# %%
# Path to utils.py (uploaded to W&B inside train())
utils_path = Path('utils.py')
# %%
# Extract xgb_hyperparams from config
# xgb_hyperparams = config.get('xgb_hyperparams', {})
# Merge xgb_hyperparams into config
# config.update(xgb_hyperparams)
# %%
# Use first config.train_percentage of dataframe rows for training, and the rest for validation and testing
num_rows = len(target_dataframes['thp_vol'])
num_train_rows = round(num_rows * config.train_percentage)
num_val_rows = round(num_rows * config.val_percentage)
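# e.g. with 1000 rows, train_percentage=0.8 and val_percentage=0.1 -> 800 train rows, 100 val rows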
# Make feature dataframes
feature_dfs = utils.create_all_feature_dfs(
target_dataframes, idx_hour_series, config)
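# %% [markdown]
# A hypothetical illustration of the kind of per-column lag features a helper like
# `utils.create_all_feature_dfs` might produce (the real implementation lives in utils.py and is
# not shown here); the column names and lag choices below are placeholders.
# %%
lag_demo_df = pl.DataFrame({'beam_a': [1.0, 2.0, 3.0, 4.0], 'beam_b': [5.0, 6.0, 7.0, 8.0]})
lag_demo_features = lag_demo_df.with_columns(
    [pl.col(c).shift(k).alias(f'{c}_lag_{k}') for c in lag_demo_df.columns for k in (1, 2)]
)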
# %%
def train():
# Initialize W&B
run = wandb.init(
project="traffic-forecasting-challenge",
job_type='train',
entity="esedx12",
save_code=True,
mode=('dryrun' if DEBUG else 'online')
)
# %%
# Save utils.py to W&B so the exact helper code is attached to this run
if utils_path.exists():
wandb.save(str(utils_path))
# %% [markdown]
# ## Feature Engineering
#
# The feature engineering steps are handled by utility functions.
# %%
config = dotdict(wandb.config.as_dict())
train_target_dfs = {k: v.head(num_train_rows).shift(-config.target_forward_shift)
for k, v in target_dataframes.items()}
train_feature_dfs = {k: v.head(num_train_rows)
for k, v in feature_dfs.items()}
train_idx_hour_series = idx_hour_series.head(num_train_rows)
# polars slice(offset, length): start at num_train_rows so the validation split begins right after the last training row
val_target_dfs = {k: v.slice(num_train_rows, num_val_rows).shift(-config.target_forward_shift)
                  for k, v in target_dataframes.items()}
val_feature_dfs = {k: v.slice(num_train_rows, num_val_rows)
                   for k, v in feature_dfs.items()}
val_idx_hour_series = idx_hour_series.slice(num_train_rows, num_val_rows)
# %%
# Create long format dataframes using utility functions
long_train_df = utils.create_long_format_df(
train_target_dfs, train_feature_dfs, train_idx_hour_series, wandb.config)
long_val_df = utils.create_long_format_df(
val_target_dfs, val_feature_dfs, val_idx_hour_series, wandb.config)
target_cols = list(target_dataframes.keys())
# Assuming long_train_df and long_val_df are pandas DataFrames
X_train = long_train_df.drop(columns=target_cols)
y_train = long_train_df[target_cols]
X_val = long_val_df.drop(columns=target_cols)
y_val = long_val_df[target_cols]
wandb.config.update({
'num_train_samples': len(X_train),
'num_val_samples': len(X_val),
'features': X_train.columns.to_list(),
'targets': y_train.columns.to_list()
})
# %% [markdown]
# ## Train Models
# * TODO if indicated for performance reasons, get the max idx_hour with a null and return it so we can shorten the df for multi-step predict
# * TODO also add target transformations (maybe sklearn can help)
# * TODO normalize somehow if data is on very different scales for different beams
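# %%
# Sketch of the target-transformation TODO above: TransformedTargetRegressor fits the wrapped
# regressor on log1p(y) and maps predictions back with expm1. It is only constructed here as an
# illustration (using names already imported above) and is not fitted or used in the training loop yet.
ttr_sketch = TransformedTargetRegressor(
    regressor=xgb.XGBRegressor(n_estimators=100),
    func=np.log1p,
    inverse_func=np.expm1,
)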
# %% [markdown]
# ### Fit models
# %%
# scikit-learn linear model
if wandb.config.model == 'linear':
    models = {}
    # One-hot encode once and align the validation columns to the training columns so the
    # dummy categories match between the two splits
    X_train_dummies = pd.get_dummies(X_train)
    X_val_dummies = pd.get_dummies(X_val).reindex(columns=X_train_dummies.columns, fill_value=0)
    for target in target_cols:
        model = sklearn.linear_model.LinearRegression()
        model.fit(X_train_dummies, y_train[target])
        models[target] = model
        # Log validation MAE to W&B and print it
        y_pred = model.predict(X_val_dummies)
        mae = sklearn.metrics.mean_absolute_error(y_val[target], y_pred)
        wandb.log({f'mae_{target}': mae})
        print(f'MAE for {target}: {mae}')
# %%
X_train.columns[:10]
# %%
# XGBoost model
if wandb.config.model == 'xgboost':
    models = {}
    # NOTE: only the first target is fitted for now (see the multi-output TODO above).
    # **config forwards the sweep hyperparameters to XGBoost, so ideally config should contain
    # only valid XGBoost hyperparameters here (cf. the commented-out xgb_hyperparams extraction above).
    for target_name in y_train.columns[:1]:
        model = xgb.XGBRegressor(
            **config, callbacks=[WandbCallback(log_model=True)])
        print(f"\nFitting model for {target_name}:")
        model.fit(
            X_train,
            y_train[target_name],
            eval_set=[(X_train, y_train[target_name]),
                      (X_val, y_val[target_name])],
            verbose=25
        )
        models[target_name] = model
# %% [markdown]
# ### Save models
# %%
for target_name, model in models.items():
    model_dir = Path('checkpoints_final')
    model_dir.mkdir(parents=True, exist_ok=True)
    # Include the target name so checkpoints for different targets do not overwrite each other
    model_path = model_dir / f'{target_name}_forward_shift_{wandb.config.target_forward_shift}'
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
# wandb.save(str(model_path))
# %%
wandb.finish()
if __name__ == '__main__':
sweep_configuration_path = 'wandb_sweep.yaml'
with open(sweep_configuration_path, 'r') as file:
sweep_configuration = yaml.safe_load(file)
sweep_id = wandb.sweep(sweep=sweep_configuration, project="traffic-forecasting-challenge")
wandb.agent(sweep_id, function=train, count=168)