-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdataproc.py
1046 lines (824 loc) · 43 KB
/
dataproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# ADULT
# info: https://archive.ics.uci.edu/ml/datasets/adult
# (gerry fair version: https://github.com/algowatchpenn/GerryFair)
# COMPAS
# info: https://www.kaggle.com/danofer/compass
# BOSTON HOUSING
# info: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html
# SOUTH GERMAN CREDIT
# info: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29
# GARMENT PRODUCTIVITY
# info: https://archive.ics.uci.edu/ml/datasets/Productivity+Prediction+of+Garment+Employees
import numpy as np
import pandas as pd
# helper functions
'''
def get_one_perc(intervals, feat_name):
range = intervals[feat_name][1] - intervals[feat_name][0]
return range / 100.0
def get_iqr(df, feat_name):
q1 = df[feat_name].quantile(0.25)
q3 = df[feat_name].quantile(0.75)
iqr = q3 - q1
return iqr
def check_abp(feature_names, categorical_feature_names, a,b,p):
for feat in feature_names:
assert(p[feat] in [None,'<=','=','>='])
if feat in categorical_feature_names:
assert(a[feat] is None or (len(a[feat])>1))
assert(b[feat] is None or (len(b[feat])>1))
else:
assert(a[feat] is None or (a[feat][0] <= 0 and a[feat][1] >= 0))
assert(b[feat] is None or (b[feat][0] <= 0 and b[feat][1] >= 0))
'''
def check_X_respects_intervals(X, feature_intervals, indices_categorical_features):
for x in X:
for i, x_i in enumerate(x):
if i in indices_categorical_features:
assert(x_i in feature_intervals[i])
else:
assert(x_i >= feature_intervals[i][0] and x_i <= feature_intervals[i][1])
# gets a dataset in my format and re-arranges it for LORE
def make_lore_compliant_dataset(dataset):
# imports needed only if using this
from lore.util import recognize_features_type, set_discrete_continuous, label_encode
df = dataset['df']
# LORE wants the label to be the first column instead of the last one
columns = list(df.columns)
columns = columns[-1:] + columns[:-1]
df = df[columns]
possible_outcomes = list(df['LABEL'].unique())
type_features, features_type = recognize_features_type(df, 'LABEL')
discrete, continuous = set_discrete_continuous(dataset['feature_names'], type_features, 'LABEL', dataset['categorical_feature_names'], continuous=None)
idx_features = {i: col for i, col in enumerate(dataset['feature_names'])}
_, label_encoder = label_encode(df, discrete)
# check that order of columns matches order of features
assert(list(df.columns) == ['LABEL'] + dataset['feature_names'])
lore_dataset = dataset
# add additional info
lore_dataset['columns'] = columns
lore_dataset['class_name'] = 'LABEL'
lore_dataset['possible_outcomes'] = possible_outcomes
lore_dataset['type_features'] = type_features
lore_dataset['features_type'] = features_type
lore_dataset['discrete'] = discrete
lore_dataset['continuous'] = continuous
lore_dataset['idx_features'] = idx_features
lore_dataset['label_encoder'] = label_encoder
return lore_dataset
def gimme_boston(datasets_folder="./datasets",return_lore_version=False):
# 1) Load data set and frame as pandas data frame
Xy = np.genfromtxt(datasets_folder+"/boston_housing.csv", delimiter=',')
X = Xy[:,:-1]
y = Xy[:,-1]
# make classification like T. Laugel
y = np.array([1 if x > 26.0 else 0 for x in y]).astype(int)
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
categorical_feature_names = ['CHAS']
indices_categorical_features = [i for i, f in enumerate(feature_names) if f in categorical_feature_names]
boston_dict = dict()
for i, feat in enumerate(feature_names):
boston_dict[feat] = X[:,i]
boston_dict['LABEL'] = y
df = pd.DataFrame(boston_dict)
# 2) Set meaningful min and max intervals for the features
intervals = dict()
intervals['CRIM'] = (0.0, 100.0)
intervals['ZN'] = (0.0, 100.0)
intervals['INDUS'] = (0.0, 30.0)
intervals['CHAS'] = (0, 1) # this is category
intervals['NOX'] = (0.3, 1.0)
intervals['RM'] = (2,10)
intervals['AGE'] = (2.0, 100.0)
intervals['DIS'] = (0.5, 15.0)
intervals['RAD'] = (1, 24) # index, can be treated as a numerical variable
intervals['TAX'] = (150.0, 800.0)
intervals['PTRATIO'] = (10.0, 25.0)
intervals['B'] = (0.0, 400.0) # the description of this features seems to be incorrect
intervals['LSTAT'] = (1.0, 40.0)
feature_intervals = np.array([np.array(intervals[feat]) for feat in feature_names], dtype=object)
# 3) Set perturbations and plausibility constraints of action
perturb, plausib = dict(), dict()
# CRIM - per capita crime rate by town
# the municipality can invest in, e.g., police, to reduce criminality; but it might also raise (probably less so if it is a feature we act upon)
perturb['CRIM'] = {'type':'relative', 'increase':0.05, 'decrease':0.01}
plausib['CRIM'] = None # the municipality can invest to reduce it, or cut funds to police so it might increase
# ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# perturbations can probably lead to *more* residential land zones appering than disappearing
perturb['ZN'] = {'type':'relative', 'increase':0.05, 'decrease':0.01}
plausib['ZN'] = None # the municipality can act to change this situation (e.g., via tax deducations/increases for housing)
# INDUS - proportion of non-retail business acres per town
# Similar to ZN, but probably more likely to decrease
perturb['INDUS'] = {'type':'relative', 'increase':0.01, 'decrease':0.02}
plausib['INDUS'] = None
# CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# no perturbations
perturb['CHAS'] = None
# not mutable
plausib['CHAS'] = '='
# NOX - nitric oxides concentration (parts per 10 million)
# perturbations more likely to increase this
perturb['NOX'] = {'type':'relative', 'increase':0.05, 'decrease':0.01}
plausib['NOX'] = None # investments can be done / abandoned to increase it or decrease it
# RM - average number of rooms per dwelling
# might vary because of how inhabitants behave because of several reasons
perturb['RM'] = {'type':'absolute', 'increase':1, 'decrease':1}
# assume we can put incentives to change this
plausib['RM'] = None
# AGE - proportion of owner-occupied units built prior to 1940
# assume it can only decrease. If left uncheck tends to decrease because new ones are built and existing fall apart
perturb['AGE'] = {'type':'relative', 'increase':0, 'decrease':0.05} # relative
plausib['AGE'] = '<='
# DIS - weighted distances to five Boston employment centres
# assume it is not mutable
perturb['DIS'] = None
plausib['DIS'] = '='
# RAD (this index will likely improve)
perturb['RAD'] = {'type':'absolute', 'increase':3, 'decrease':0} #absolute
plausib['RAD'] = None
# TAX - full-value property-tax rate per $10,000
# we can act to decrease or increase taxes.
perturb['TAX'] = {'type':'relative', 'increase':0.05, 'decrease':0.05} # relative
# perturb['TAX'] = {'type':'absolute', 'increase':1, 'decrease':1} #absolute
plausib['TAX'] = None
# PTRATIO - pupil-teacher ratio by town
# more likely to increase than decrease (pop growth)
perturb['PTRATIO'] = {'type':'relative', 'increase':0.05, 'decrease':0.03} # relative
# we can invest to change this quantity
plausib['PTRATIO'] = None
# B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# This is a controversial feature. Let's say we do not act on this; but it can change by itself
perturb['B'] = {'type':'relative', 'increase':0.05, 'decrease':0.05} # relative
plausib['B'] = '='
# LSTAT - % lower status of the population
# Not sure what is meant by this, is "lower status" meaning poorer/less educated?
# this is already a percentange so we treat it as absolute
perturb['LSTAT'] = {'type':'absolute', 'increase':5, 'decrease':5} #absolute
plausib['LSTAT'] = None # investments can be made / stopped to improve / make worse the situtation
# check that perturb and plausib have been filled
for feat in feature_names:
assert(feat in perturb)
assert(feat in plausib)
# 4) set label, prep numpy arrays
X = df[feature_names].to_numpy().astype(float)
y = df['LABEL'].to_numpy().astype('int')
# check that all is in order
assert(list(df.columns) == feature_names + ['LABEL'])
check_X_respects_intervals(X, feature_intervals, indices_categorical_features)
dataset = {
'name': 'boston',
'best_class' : 0,
'feature_names': feature_names,
'categorical_feature_names': categorical_feature_names,
'indices_categorical_features': indices_categorical_features,
'df': df,
'X': X,
'y': y,
'feature_intervals': feature_intervals,
'perturbations' : [perturb[key] for key in feature_names],
'plausibility_constraints' : [plausib[key] for key in feature_names]
}
if return_lore_version:
dataset = make_lore_compliant_dataset(dataset)
return dataset
def gimme_compas(datasets_folder="./datasets",return_lore_version=False):
# 1) Load data and do some preprocessing
df = pd.read_csv(datasets_folder+'/compas-scores-two-years.csv')
# I follow R. Guidotti's LORE here:
# https://github.com/riccotti/LORE/blob/710ffb42bf764bae90e9295e14349f0250fc2628/prepare_dataset.py#L100
# except for the fact that I also exclude age_cat, since we have age already
columns = ['age', 'sex', 'race', 'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
'c_charge_degree', 'is_recid', 'is_violent_recid', 'two_year_recid', 'decile_score', 'score_text']
df = df[columns]
# like Guidotti
del df['score_text']
df['days_b_screening_arrest'] = np.abs(df['days_b_screening_arrest'])
df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
df['length_of_stay'] = (df['c_jail_out'] - df['c_jail_in']).dt.days
df['length_of_stay'] = np.abs(df['length_of_stay'])
# different from Guidotti, we drop NAs instead of filling them in
df.dropna(axis=0, inplace=True)
# like Guidotti, remove jail in and out
del df['c_jail_in']
del df['c_jail_out']
df['length_of_stay'] = df['length_of_stay'].astype(int)
df['days_b_screening_arrest'] = df['days_b_screening_arrest'].astype(int)
# Guidotti fills missing values, we remove these rows
#df['length_of_stay'].fillna(df['length_of_stay'].value_counts().index[0], inplace=True)
#df['days_b_screening_arrest'].fillna(df['days_b_screening_arrest'].value_counts().index[0], inplace=True)
def get_class(x):
if x < 7:
return 0
else:
return 1
df['LABEL'] = df['decile_score'].apply(get_class)
del df['decile_score']
# keep top 2000 rows only
df = df[:2000]
feature_names = list(df.columns)
feature_names.remove('LABEL')
categorical_feature_names = ['sex', 'race', 'c_charge_degree', 'is_recid', 'is_violent_recid', 'two_year_recid']
indices_categorical_features = [i for i, f in enumerate(feature_names) if f in categorical_feature_names]
# convert categorical features to codes
for feat in categorical_feature_names:
df[feat] = pd.Categorical(df[feat])
df[feat] = df[feat].cat.codes
# 2) set up intervals
intervals = dict()
# for categorical features, put all possible categories
for feat in categorical_feature_names:
intervals[feat] = sorted(df[feat].unique().tolist())
# next define numerical features
intervals['age'] = (18, 100)
intervals['priors_count'] = (0, 100) # the max of the data set is 38 but I guess there is no real limit here
intervals['days_b_screening_arrest'] = (0, 1200) # the max of the data set is 1057, I cap it at 1200
intervals['length_of_stay'] = (0, 1200) # the max of the data set is 799, I cap this also at 1200
# check we inserted all features
for feat in feature_names:
assert(feat in intervals)
feature_intervals = np.array([np.array(intervals[feat]) for feat in feature_names], dtype=object)
# 3) Set perturbations and plausibility constraints of action
perturb, plausib = dict(), dict()
# let us initialize them to None as many features cannot change
for feat in feature_names:
perturb[feat] = None
plausib[feat] = None
# Let's assume that we can actively decide to postpone the assessment by the black-box by up to 2 years, this means that age can increase
perturb['age'] = {'type':'absolute', 'increase':2, 'decrease':0} #absolute
plausib['age'] = '>='
# similarly, let's assume that we can actively decide to stay longer before assessment, by 2*365 days = 730
perturb['length_of_stay'] = {'type':'absolute', 'increase':730, 'decrease':0} #absolute
plausib['length_of_stay'] = '>='
# let's assume that we can be assessed up to two years later because of terrible delays in the system
perturb['days_before_screening_arrest'] = {'type':'absolute', 'increase':730, 'decrease':0} #absolute
plausib['days_before_screening_arrest'] = None
# Priors that were not counted for are discovered at the time of the assessment.
# Since the dataset's max is 38, let us say that up to 3 more can be found
# Let us also say that we might find that some info was incorrect, so priors can lower
perturb['priors_count'] = {'type':'absolute', 'increase':3, 'decrease': 2} #absolute
plausib['priors_count'] = None
# Recidism can become true or false if new priors are discovered, dismissed
perturb['is_recid'] = {'type':'absolute', 'categories': df['is_recid'].unique() }
plausib['is_recid'] = None
# similar reasoning as above
perturb['two_year_recid'] = {'type':'absolute', 'categories': df['two_year_recid'].unique() }
plausib['two_year_recid'] = None
# the same could be said for violent_recid but this is unlikely, i.e., unlikely that a prior that was not accounted for was actually a violent one
perturb['is_violent_recid'] = None
plausib['is_violent_recid'] = '='
# sex cannot change
perturb['sex'] = None
plausib['sex'] = '='
# check that perturb and plausib have been filled
for feat in feature_names:
assert(feat in perturb)
assert(feat in plausib)
# 4) Create numpy array and wrap it up
X = df[feature_names].to_numpy()
y = df['LABEL'].to_numpy().astype(int)
# check that all is in order
assert(list(df.columns) == feature_names + ['LABEL'])
check_X_respects_intervals(X, feature_intervals, indices_categorical_features)
dataset = {
'name': 'compas',
'best_class' : 0,
'feature_names': feature_names,
'categorical_feature_names': categorical_feature_names,
'indices_categorical_features': indices_categorical_features,
'df': df,
'X': X,
'y': y,
'feature_intervals': feature_intervals,
'perturbations' : [perturb[key] for key in feature_names],
'plausibility_constraints' : [plausib[key] for key in feature_names]
}
if return_lore_version:
dataset = make_lore_compliant_dataset(dataset)
return dataset
def gimme_adult(datasets_folder="./datasets", return_lore_version=False):
# 1) Load data and do some preprocessing
df = pd.read_csv(datasets_folder+'/adult.csv', delimiter=',', skipinitialspace=True)
# I follow a pre-processing similar to R. Guidotti's LORE:
# https://github.com/riccotti/LORE/blob/710ffb42bf764bae90e9295e14349f0250fc2628/prepare_dataset.py
# remove so-called useless columns as done by R. Guidotti
del df['fnlwgt']
# Guidotti removes education-num and keeps education, however
# education is an index rather than a class, so here we keep the numerical feature
# (e.g. assuming you can improve your education-num by following specialization courses)
#del df['education-num']
del df['education']
# remove rows with missing Values
df.replace('?', np.nan, inplace=True)
df.dropna(axis=0, inplace=True)
# preprocess categorical features
categorical_feature_names = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
# convert categorical features to codes
for feat in categorical_feature_names:
df[feat] = pd.Categorical(df[feat])
df[feat] = df[feat].cat.codes
# change y's label
df['LABEL'] = df['income']
del df['income']
# all feature names
feature_names = list(df.columns)
feature_names.remove('LABEL')
indices_categorical_features = [i for i, f in enumerate(feature_names) if f in categorical_feature_names]
# 2) set up intervals
intervals = dict()
# let's intilize categorical features
for feat in categorical_feature_names:
intervals[feat] = sorted(df[feat].unique().tolist())
# if you want to print intervals from df:
#for feat in [x for x in feature_names if x not in categorical_feature_names]:
# print(feat, df[feat].min(), df[feat].max())
intervals['age'] = (17, 90) # like min and max from dataset
intervals['education-num'] = (2, 16) # like min and max from dataset
intervals['capital-gain'] = (0, 99999) # like min and max from dataset
intervals['capital-loss'] = (0, 4500) # the max of the "full" dataset is 4356, making it a nice round number
intervals['hours-per-week'] = (1, 99) # like min and max from "full" dataset
feature_intervals = np.array([np.array(intervals[feat]) for feat in feature_names], dtype=object)
# 3) Set perturbations and plausibility constraints of action
perturb, plausib = dict(), dict()
# let us initialize them in bunch
for feat in feature_names:
if 'workclass'==feat or 'marital-status'==feat or 'occupation'==feat or 'relationship'==feat:
# these can vary, let us assume it is not happening when we decide to act on them
perturb[feat] = {'type':'absolute', 'categories': df[feat].unique() }
plausib[feat] = None
elif 'race'==feat or 'native-country'==feat:
# you cannot change your race nor native country
perturb[feat] = None
plausib[feat] = '=' # must stay the same
elif 'sex'==feat:
# no perturbations possible
perturb[feat] = None
plausib[feat] = '=' # must stay the same
elif feat == 'age':
# let us say that, if we decide to wait and get older, we can have a delay of half a year happening
# but if this is not something we actively focus on, delays can cause us to age, say up to 2 years
perturb[feat] = {'type':'absolute', 'increase':2, 'decrease':0} #absolute
# age can only increase
plausib[feat] = '>='
elif feat == 'capital-gain':
# it can happen that we gain less/more than we want due to some circumstances
# probably more likely to loose gain than to... gain gain
perturb[feat] = {'type':'relative', 'increase':0.05, 'decrease':0.1}
plausib[feat] = None
elif feat == 'capital-loss':
# opposite of capital gain
perturb[feat] = {'type':'relative', 'increase':0.1, 'decrease':0.05}
plausib[feat] = None
elif feat == 'hours-per-week':
# e.g. need to work less / more for some weeks due to some circumstances
perturb[feat] = {'type':'absolute', 'increase':5, 'decrease':5}
plausib[feat] = None
elif feat == 'education-num':
# this does not change by itself, and you can only improve your education level
perturb[feat] = None
plausib[feat] = '>='
else:
raise ValueError("Not handled feat", feat)
# check that perturb and plausib have been filled
for feat in feature_names:
assert(feat in perturb)
assert(feat in plausib)
# 4) set label, prep numpy arrays
X = df[feature_names].to_numpy()
y = df['LABEL'].to_numpy().astype(int)
# check that all is in order
assert(list(df.columns) == feature_names + ['LABEL'])
check_X_respects_intervals(X, feature_intervals, indices_categorical_features)
dataset = {
'name': 'adult',
'best_class' : 1,
'feature_names': feature_names,
'categorical_feature_names': categorical_feature_names,
'indices_categorical_features': indices_categorical_features,
'df': df,
'X': X,
'y': y,
'feature_intervals': feature_intervals,
'perturbations' : [perturb[key] for key in feature_names],
'plausibility_constraints' : [plausib[key] for key in feature_names]
}
if return_lore_version:
dataset = make_lore_compliant_dataset(dataset)
return dataset
def gimme_credit(datasets_folder="./datasets",return_lore_version=False):
# 1) Load data set and frame as pandas data frame
df = pd.read_csv(datasets_folder+'/south_german_credit.csv')
# Note: some features are categorical indices (larger number = better) and we consider them to be numerical
# (assuming small changes are possible, of course)
# these are:
'''
present_emp_since,
savings,
account_check_status,
credit_history,
housing,
job,
property
'''
# preprocess categorical features
categorical_feature_names = ['purpose', 'personal_status_sex',
'other_debtors', 'other_installment_plans', 'telephone', 'foreign_worker']
for feat in categorical_feature_names:
df[feat] = pd.Categorical(df[feat])
df[feat] = df[feat].cat.codes
# change y's label
df['LABEL'] = df['credit_risk']
del df['credit_risk']
# note: in south german credit, as opposed to older version of german credit,
# credit_risk = 0 means bad, credit_risk = 1 means good
# set feature names
feature_names = list(df.columns)
feature_names.remove('LABEL')
# keep track of which features are categorical
indices_categorical_features = [i for i, f in enumerate(feature_names) if f in categorical_feature_names]
# 2) set up intervals
intervals = dict()
# intervals for categorical features to their possibilities
for feat in categorical_feature_names:
intervals[feat] = sorted(df[feat].unique().tolist())
# set numerical feature intervals
# let's just initialize them to max and min in the data, then refine some
for feat in feature_names:
if feat not in categorical_feature_names:
intervals[feat] = (df[feat].min(), df[feat].max())
intervals['age'] = (18, 75) # min of data set is 19, max is 75
intervals['duration_in_month'] = (3, 75) # data set min and max are 4, 72, let's allow a bit more wiggle room
intervals['credit_amount'] = (250, 20000) # data set min and max are 250, 18,424; the latter seems arbitrary so let's cap at 20,000 instead
# check we set everything
for feat in feature_names:
assert (feat in intervals)
feature_intervals = np.array([np.array(intervals[feat]) for feat in feature_names], dtype=object)
# 3) Set perturbations and plausibility constraints of action
perturb, plausib = dict(), dict()
# let's start with numerical features
# can get older, say up to half-a-year of delay in the assessment
# cannot get younger
perturb['age'] = {'type':'absolute', 'increase':0.5, 'decrease':0}
plausib['age'] = '>='
# the income might vary and thus this quantity
perturb['installment_as_income_perc'] = {'type':'relative', 'increase':0.10, 'decrease':0.10}
plausib['installment_as_income_perc'] = None
# how long one has been at the current residence
# cannot vary without our control (sure one could be evicted but let's assume that's extremely rare)
perturb['present_res_since'] = None
# this can grow or drop (e.g., to 0) if one relocates; for simplicity let's just assume that one does not change residence (hence >=)
plausib['present_res_since'] = '>='
# duration of the credit
# assume perturbations can change it a bit relative to what was asked for
# (but less likely to decrease than increase)
perturb['duration_in_month'] = {'type':'relative', 'increase':0.25, 'decrease':0.05}
plausib['duration_in_month'] = None
# assume purpose does not change
perturb['purpose'] = None
# also: not plausible to change purpose
# (pretty useless to know that if you want a house instead of a yacht, then they give you the credit for that)
plausib['purpose'] = '='
# housing type: assume no perturbations similar to present_res_since
perturb['housing'] = None
# but let's say it is plausible to change house
plausib['housing'] = None
# series of features which might vary
for feat in ['account_check_status','credit_this_bank']:
perturb[feat] = {'type':'absolute', 'increase':1, 'decrease':1}
plausib[feat] = None
# plausible to go both ways (we can ask less or more credit)
# let's say that events may happen that require us to ask a bit more or a bit less
perturb['credit_amount'] = {'type':'relative', 'increase':0.1, 'decrease':0.1}
plausib['credit_amount'] = None
perturb['savings'] = {'type':'relative', 'increase':0.1, 'decrease':0.1}
plausib['savings'] = None
# similar reasoning to present_res_since
perturb['present_emp_since'] = None
plausib['present_emp_since'] = '>='
# some features we assume cannot vary due to external events but are plausible to intentionally change
for feat in ['other_installment_plans', 'credits_this_bank', 'job']:
perturb[feat], plausib[feat] = None, None
# some features that we assume cannot vary nor change
for feat in ['credit_history','personal_status_sex','other_debtors','property',
'telephone', 'foreign_worker', 'people_under_maintenance']:
perturb[feat], plausib[feat] = None, '='
# check we filled them all
for feat in feature_names:
assert(feat in perturb)
assert(feat in plausib)
# 4) set label, prep numpy arrays
X = df[feature_names].to_numpy()
y = df['LABEL'].to_numpy().astype(int)
# check that all is in order
assert(list(df.columns) == feature_names + ['LABEL'])
check_X_respects_intervals(X, feature_intervals, indices_categorical_features)
dataset = {
'name': 'credit',
'best_class' : 1,
'feature_names': feature_names,
'categorical_feature_names': categorical_feature_names,
'indices_categorical_features': indices_categorical_features,
'df': df,
'X': X,
'y': y,
'feature_intervals': feature_intervals,
'perturbations' : [perturb[key] for key in feature_names],
'plausibility_constraints' : [plausib[key] for key in feature_names]
}
if return_lore_version:
dataset = make_lore_compliant_dataset(dataset)
return dataset
def gimme_garments(datasets_folder="./datasets", return_lore_version=False):
# 1) Load data and do some preprocessing
df = pd.read_csv(datasets_folder+"/garments_worker_productivity.csv", delimiter=',')
# remove date and wip (a lot of missing values in the latter)
df.drop(['date','wip'], axis=1, inplace=True)
# remove duplicate rows
mask_unique = [not d for d in df.drop('actual_productivity', axis=1).duplicated()]
df = df[mask_unique]
# create (quasi-)balanced ternary label
df['LABEL'] = 0
df.loc[df.actual_productivity > .7, 'LABEL'] = 1
df.loc[df.actual_productivity > .81, 'LABEL'] = 2
del df['actual_productivity']
feature_names = df.columns.to_list()
feature_names.remove('LABEL')
categorical_feature_names = ['quarter', 'department', 'day', 'team', 'no_of_style_change']
indices_categorical_features = [i for i, f in enumerate(feature_names) if f in categorical_feature_names]
# convert categorical features to codes
for feat in categorical_feature_names:
df[feat] = pd.Categorical(df[feat])
df[feat] = df[feat].cat.codes
# 2) Set meaningful min and max intervals for the features
# (for categoricals we store all categories)
intervals = dict()
for feat in categorical_feature_names:
intervals[feat] = sorted(df[feat].unique().tolist())
intervals['smv'] = (2.5, 60) # similar to min and max, rounded them a bit
intervals['over_time'] = (0, 25920) # the max of the data set is 25920 min = 18 days
intervals['incentive'] = (0, 3600) # just like min and max of data set
intervals['idle_time'] = (0, 300) # like min and max of the data set
intervals['idle_men'] = (0, 50) # min and max are 0 45, we round the latter up
intervals['no_of_workers'] = (1, 100) # min and max are 2 89, we imagine it can be 1 to 100
intervals['targeted_productivity'] = (0.05, 0.8) # min and max are 0.07 and 0.8
# check we set them all
for feat in feature_names:
assert(feat in intervals.keys())
feature_intervals = np.array([np.array(intervals[feat]) for feat in feature_names], dtype=object)
# 3) Set perturbations and plausibility constraints of action
# we assume we are the company manager, who acts to improve their workers' productivity
perturb, plausib = dict(), dict()
# Team: Associated team number with the instance
# Cannot be perturbed, but it is actionable if we assume that teams can be re-assigned
perturb['team'] = None
plausib['team'] = None
# day : Day of the Week
# let's assume that delays can change the scheduled day for production to change
perturb['day'] = {'type':'absolute', 'categories': df['day'].unique() }
# we can decide to change the day to do a certain task as we like
plausib['day'] = None
# quarter: A portion of the month. A month was divided into four quarters
# similar reasoning to day
perturb['quarter'] = {'type':'absolute', 'categories': df['quarter'].unique() }
# we can decide to change the quarter as we like
plausib['quarter'] = None
# department: Associated department with the instance
# we assume they cannot vary beyond our control
perturb['department'] = None
# we can vary it as we like
plausib['department'] = None
# no_of_workers: Number of workers in each team
# workers might quit/become ill for example. they cannot be hired without we willingly want to.
perturb['no_of_workers'] = {'type':'relative', 'increase':0, 'decrease':0.1}
# assume we can let off or hire personell
plausib['no_of_workers'] = None
# no_of_style_change : Number of changes in the style of a particular product
# assume this cannot be perturbed
perturb['no_of_style_change'] = None
# we can act to modify how much style changes there should be
plausib['no_of_style_change'] = None
# targeted_productivity : Targeted productivity set by the Authority for each team for each day.
# it's a fixed goal, cannot be perturbed but can be changed willingly
perturb['targeted_productivity'] = None
plausib['targeted_productivity'] = None
# smv : Standard Minute Value, it is the allocated time for a task
# Let's assume the "real" smv is subject to perturbations
perturb['smv'] = {'type':'relative', 'increase':0.1, 'decrease':0.1}
# we can change how much time we think should be allocated for a task
plausib['smv'] = None
# over_time : Represents the amount of overtime by each team in minutes
# Let's assume that this can be subject to perturbations
# max is 25920 min = 18 days. Let's say perturbations can cause up to +-3 days
perturb['over_time'] = {'type':'absolute', 'increase':3*24*60, 'decrease':3*24*60}
# We can demand more or less overtime
plausib['over_time'] = None
# incentive : Represents the amount of financial incentive (in BDT) that enables or motivates a particular course of action.
# Assume it is not subject to perturbations
perturb['incentive'] = None
# the company can change it
plausib['incentive'] = None
# idle_time : The amount of time when the production was interrupted due to several reasons
# Probably more likely that idle time increses rather than decreases due to perturbations
perturb['idle_time'] = {'type':'relative', 'increase':0.1, 'decrease':0.05}
# we assume the company can act to change this
plausib['idle_time'] = None
# idle_men : The number of workers who were idle due to production interruption
# Same reasoning as for idle_time
perturb['idle_men'] = {'type':'relative', 'increase':0.1, 'decrease':0.05}
# can take actions to change this
plausib['idle_men'] = None
# check that we filled them all
for feat in feature_names:
assert(feat in perturb)
assert(feat in plausib)
# 4) set label, prep numpy arrays
X = df[feature_names].to_numpy().astype(float)
y = df['LABEL'].to_numpy().astype('int')
# check that all is in order
assert(list(df.columns) == feature_names + ['LABEL'])
dataset = {
'name': 'garments',
'best_class' : 2,
'feature_names': feature_names,
'categorical_feature_names': categorical_feature_names,
'indices_categorical_features': indices_categorical_features,
'df': df,
'X': X,
'y': y,
'feature_intervals': feature_intervals,
'perturbations' : [perturb[key] for key in feature_names],
'plausibility_constraints' : [plausib[key] for key in feature_names]
}
if return_lore_version:
dataset = make_lore_compliant_dataset(dataset)
return dataset
def gimme_student(datasets_folder="./datasets", return_lore_version=False):
# 1) Load data and do some preprocessing
df = pd.read_csv(datasets_folder+"/student_mat.csv", delimiter=',')
# remove unused labels G1 & G2
df.drop(['G1','G2'], axis=1, inplace=True)
# create ternary label (balanced)
df['G3'] = df['G3'].astype(int)
df['LABEL'] = 0
df.loc[df['G3'] > 9, 'LABEL'] = 1
df.loc[df['G3'] > 12, 'LABEL'] = 2
del df['G3']
feature_names = df.columns.to_list()
feature_names.remove('LABEL')
categorical_feature_names = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason',
'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'failures']
indices_categorical_features = [i for i, f in enumerate(feature_names) if f in categorical_feature_names]
# convert categorical features to codes
for feat in categorical_feature_names:
df[feat] = pd.Categorical(df[feat])
df[feat] = df[feat].cat.codes
# 2) Set meaningful min and max intervals for the features
# (for categoricals we store all categories)
intervals = dict()
for feat in categorical_feature_names:
intervals[feat] = sorted(df[feat].unique().tolist())
intervals['age'] = (15, 24) # similar to min and max, allowed for a slightly larger max
intervals['Medu'] = (0, 4) # min and max
intervals['Fedu'] = (0, 4) # min and max
intervals['traveltime'] = (1, 4) # min and max
intervals['studytime'] = (1, 4) # min and max
intervals['famrel'] = (1, 5) # min and max
intervals['freetime'] = (1, 5) # min and max
intervals['goout'] = (1, 5) # min and max
intervals['Dalc'] = (1, 5) # min and max
intervals['Walc'] = (1, 5) # min and max
intervals['health'] = (1, 5) # min and max
intervals['absences'] = (0, 100) # similar to min (1) and max (93)
# check we set them all
for feat in feature_names:
assert(feat in intervals.keys())
feature_intervals = np.array([np.array(intervals[feat]) for feat in feature_names], dtype=object)
# 3) Set a's, b's, and plausibility constraints
# we assume we are the company manager, who acts to improve their workers' productivity
perturb, plausib = dict(), dict()
# 1 school
# the type of school cannot be changed by a perturbation
perturb['school'] = None
# we assume the student can change school
plausib['school'] = None
# 2 sex - student's sex
# of course cannot change by itself
perturb['sex'] = None
# a person can change gender but we doubt that's a plausible recommendation
plausib['sex'] = '='
# 3 age - student's age
# external circumstances can cause delays (e.g., the student is ill and skips one year)
perturb['age'] = {'type':'absolute', 'increase':1, 'decrease':0}
# age can only increase
plausib['age'] = '>='
# 4 address - student's home address type (binary: "U" - urban or "R" - rural)
# the family might decide to relocate irrespective of the situation of the student
perturb['address'] = {'type':'absolute', 'categories': df['address'].unique() }
# the family can in fact decide to act i.e. move, to e.g. have the student be closer to the school
plausib['address'] = None
# 5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
# Events might change the size of the family itself (e.g., a baby is born, a grandparent passes)
perturb['famsize'] = {'type':'absolute', 'categories': df['famsize'].unique() }
# one does not take action on increasing or decreasing the family size to improve a student's grade
plausib['famsize'] = "="
# 6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
# might change by itself, for the better or worse...
perturb['Pstatus'] = {'type':'absolute', 'categories': df['Pstatus'].unique() }
# Similarly to famsize, hard to prescribe parents to get back together...
plausib['Pstatus'] = "="
# For the following features, we assume they cannot really change by perturbations
# nor the parents are likely gonna make it to act sufficiently fast for these to change
# (note: jobs can become "at_home" if parents get fired, but let's positively assume that's very unlikely)
for feat in ['Medu','Fedu','Mjob','Fjob', 'reason', 'failures']:
perturb[feat], plausib[feat] = None, '='
# 12 guardian - student's guardian (nominal: "mother", "father" or "other")
# Might change if parents separate, one passes, etc.
perturb['guardian'] = {'type':'absolute', 'categories': df['guardian'].unique() }
# Might willingly want to change that (e.g., recommend the student to live with the (sperated) father if closer to school)
plausib['guardian'] = None
# 13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
# Let's say that can be reduced or increased because of several reasons
# Less change much by perturbations if this is something we act upon
perturb['traveltime'] = {'type':'absolute', 'increase':3, 'decrease':3}
# let's assume that we can act to reduce this (e.g., by a different means of transporation)
plausib['traveltime'] = None
# 14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
# Assume perturbations can only make this worse
perturb['studytime'] = {'type':'absolute', 'increase':0, 'decrease':3}
plausib['studytime'] = None
# 16 schoolsup - extra educational support (binary: yes or no)
# Assume actionable and also mutable by perturbations
perturb['schoolsup'] = {'type':'absolute', 'categories': df['schoolsup'].unique() }
plausib['schoolsup'] = None
# 17 famsup - family educational support (binary: yes or no)
# like above probably
perturb['famsup'] = {'type':'absolute', 'categories': df['famsup'].unique() }
plausib['famsup'] = None
# assume that the following cannot have perturbations but can be changed
for feat in ['paid','activities','nursery','higher','internet']:
perturb[feat], plausib[feat] = None, None
# 23 romantic - with a romantic relationship (binary: yes or no)
# can change by perturbations beyond the student or student's family control
perturb['romantic'] = {'type':'absolute', 'categories': df['romantic'].unique() }
# not so plausible to recommend a student to change his/her relationship status (or is it?)
plausib['romantic'] = '='
# 24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
# let us assume that can change and acted upon to improve
perturb['famrel'] = {'type':'absolute', 'increase':1, 'decrease':1}
plausib['famrel'] = None
# 25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
# similar reasoning to famrel
perturb['freetime'] = {'type':'absolute', 'increase':1, 'decrease':1}
plausib['freetime'] = None
# 26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
# similar to famrel
perturb['goout'] = {'type':'absolute', 'increase':2, 'decrease':1}
plausib['goout'] = None
# 27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
perturb['Dalc'] = {'type':'absolute', 'increase':2, 'decrease':1}
plausib['Dalc'] = None
# 28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
perturb['Walc'] = {'type':'absolute', 'increase':2, 'decrease':1}
plausib['Walc'] = None
# 29 health - current health status (numeric: from 1 - very bad to 5 - very good)
perturb['health'] = {'type':'absolute', 'increase':1, 'decrease':3}
plausib['health'] = None
# 30 absences - number of school absences (numeric: from 0 to 93)
perturb['absences'] = {'type':'absolute', 'increase':10, 'decrease':10}
plausib['absences'] = None