# preprocess_data.py
__author__ = 'fjordonez'

import os
import zipfile
import argparse

import numpy as np
import _pickle as cp

from io import BytesIO
from pandas import Series

# Hardcoded number of sensor channels employed in the OPPORTUNITY challenge
NB_SENSOR_CHANNELS = 113
def select_subject(dataset_name, test):
    if dataset_name == 'Opportunity':
        # Fixed train/validation/test split used for the OPPORTUNITY challenge.
        if test == 'challenge':
            train_runs = ['S1-Drill', 'S1-ADL1', 'S1-ADL2', 'S1-ADL3', 'S1-ADL4', 'S2-Drill', 'S2-ADL1', 'S2-ADL2', 'S3-Drill', 'S3-ADL1', 'S3-ADL2', 'S2-ADL3', 'S3-ADL3']
            val_runs = ['S1-ADL5']
            test_runs = ['S2-ADL4', 'S2-ADL5', 'S3-ADL4', 'S3-ADL5']
            train_files = ['OpportunityUCIDataset/dataset/{}.dat'.format(run) for run in train_runs]
            val_files = ['OpportunityUCIDataset/dataset/{}.dat'.format(run) for run in val_runs]
            test_files = ['OpportunityUCIDataset/dataset/{}.dat'.format(run) for run in test_runs]
        else:
            # Leave-one-subject-out split: all runs of the given subject are
            # held out for testing; the ADL5 runs of the remaining subjects
            # are used for validation and their other runs for training.
            train = ['1', '2', '3', '4']
            runs = ['Drill', 'ADL1', 'ADL2', 'ADL3', 'ADL4', 'ADL5']
            val_runs = ['ADL5']
            test_files = ['OpportunityUCIDataset/dataset/S{}-{}.dat'.format(test, run) for run in runs]
            train.remove(test)
            runs.remove(val_runs[0])
            train_files = ['OpportunityUCIDataset/dataset/S{}-{}.dat'.format(sub, run) for sub in train for run in runs]
            val_files = ['OpportunityUCIDataset/dataset/S{}-{}.dat'.format(sub, run) for sub in train for run in val_runs]
    if dataset_name == 'Daphnet':
        if int(test) < 10:
            test = '0' + test  # Daphnet subject ids are zero-padded ('01' ... '10')
        subjects = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']
        runs = [2, 2, 3, 1, 2, 2, 2, 1, 1, 1]  # Number of recorded runs for each subject in the Daphnet dataset.
        # Hold out the test subject, then choose one of the remaining
        # subjects at random to validate on.
        train = list(subjects)
        train.remove(test)
        val = np.random.choice(train)
        train.remove(val)
        train_files = ['dataset_fog_release/dataset/S{}R0{}.txt'.format(sub, run + 1) for sub in train for run in range(runs[int(sub) - 1])]
        test_files = ['dataset_fog_release/dataset/S{}R0{}.txt'.format(test, run + 1) for run in range(runs[int(test) - 1])]
        val_files = ['dataset_fog_release/dataset/S{}R0{}.txt'.format(val, run + 1) for run in range(runs[int(val) - 1])]
    return train_files, test_files, val_files
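
# Illustrative sketch of the split (hypothetical call, not executed at import
# time): holding out Opportunity subject 2 would yield
#
#   train_files, test_files, val_files = select_subject('Opportunity', '2')
#   # train_files -> Drill and ADL1-ADL4 runs of subjects 1, 3 and 4
#   # val_files   -> ADL5 runs of subjects 1, 3 and 4
#   # test_files  -> all six runs of subject 2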
# Hardcoded thresholds defining the global maxima and minima for every one of the 113 sensor channels employed in the
# OPPORTUNITY challenge
NORM_MAX_THRESHOLDS = [3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
                       3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
                       3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
                       3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
                       3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
                       3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
                       3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
                       3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
                       3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
                       250, 25, 200, 5000, 5000, 5000, 5000, 5000, 5000,
                       10000, 10000, 10000, 10000, 10000, 10000, 250,
                       250, 25, 200, 5000, 5000, 5000, 5000, 5000, 5000,
                       10000, 10000, 10000, 10000, 10000, 10000, 250]

NORM_MIN_THRESHOLDS = [-3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
                       -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
                       -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
                       -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
                       -3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
                       -3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
                       -3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
                       -3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
                       -3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
                       -250, -100, -200, -5000, -5000, -5000, -5000, -5000, -5000,
                       -10000, -10000, -10000, -10000, -10000, -10000, -250,
                       -250, -100, -200, -5000, -5000, -5000, -5000, -5000, -5000,
                       -10000, -10000, -10000, -10000, -10000, -10000, -250]
def select_columns_opp(data):
    """Selects the 113 columns employed in the OPPORTUNITY challenge

    :param data: numpy integer matrix
        Sensor data (all features)
    :return: numpy integer matrix
        Selection of features
    """
    # ACC/GYRO ONLY
    # features_delete = np.arange(43, 50)                                       # Exclude quaternion and magnetometer readings from BACK
    # features_delete = np.concatenate([features_delete, np.arange(56, 63)])    # Exclude quaternion and magnetometer readings from RUA
    # features_delete = np.concatenate([features_delete, np.arange(69, 76)])    # Exclude quaternion and magnetometer readings from RLA
    # features_delete = np.concatenate([features_delete, np.arange(82, 89)])    # Exclude quaternion and magnetometer readings from LUA
    # features_delete = np.concatenate([features_delete, np.arange(95, 134)])   # Exclude quaternion and magnetometer readings from LLA and shoes
    # features_delete = np.concatenate([features_delete, np.arange(134, 243)])  # Exclude ambient/object sensors
    # features_delete = np.concatenate([features_delete, np.arange(244, 249)])

    # ACC/GYRO/MAG/QUAT ONLY
    features_delete = np.arange(46, 50)
    features_delete = np.concatenate([features_delete, np.arange(59, 63)])
    features_delete = np.concatenate([features_delete, np.arange(72, 76)])
    features_delete = np.concatenate([features_delete, np.arange(85, 89)])
    features_delete = np.concatenate([features_delete, np.arange(98, 102)])
    features_delete = np.concatenate([features_delete, np.arange(134, 243)])
    features_delete = np.concatenate([features_delete, np.arange(244, 249)])
    return np.delete(data, features_delete, 1)
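
# Sanity check on the deletions above (assuming the standard 250-column
# OPPORTUNITY .dat layout: one time stamp, 242 sensor channels and 7 label
# columns): 5 blocks of 4 columns + 109 + 5 = 134 columns are removed,
# leaving 250 - 134 = 116 columns, i.e. the time stamp, the 113 selected
# sensor channels and the locomotion and gesture label columns.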
def normalize(data, max_list, min_list):
    """Normalizes all sensor channels to the [0, 1] range

    :param data: numpy integer matrix
        Sensor data
    :param max_list: numpy integer array
        Array containing the maximum value for every one of the 113 sensor channels
    :param min_list: numpy integer array
        Array containing the minimum value for every one of the 113 sensor channels
    :return:
        Normalized sensor data
    """
    max_list, min_list = np.array(max_list), np.array(min_list)
    diffs = max_list - min_list
    for i in np.arange(data.shape[1]):
        data[:, i] = (data[:, i] - min_list[i]) / diffs[i]
    # Clip values that fall outside the [0, 1] boundaries
    data[data > 1] = 0.99
    data[data < 0] = 0.00
    return data
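
# A minimal numeric sketch of normalize() with made-up thresholds (the toy
# values below are illustrative, not taken from the OPPORTUNITY thresholds):
#
#   >>> toy = np.array([[-5.0, 50.0], [15.0, 100.0]])
#   >>> normalize(toy, max_list=[10, 100], min_list=[0, 0])
#   array([[0.  , 0.5 ],
#          [0.99, 1.  ]])
#
# Each column is rescaled by (x - min) / (max - min); out-of-range values are
# clipped, so -5 maps to 0.0 and 15 (above the max of 10) maps to 0.99. Note
# that the input array is modified in place as well as returned.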
def divide_x_y(data, label):
    """Segments each sample into features and label

    :param data: numpy integer matrix
        Sensor data
    :param label: string, ['gestures' (default), 'locomotion'] or -1
        Type of activities to be recognized; -1 takes the last column as the label
    :return: numpy integer matrix, numpy integer array
        Features encapsulated into a matrix and labels as an array
    """
    if label in ['locomotion', 'gestures']:
        data_x = data[:, 1:NB_SENSOR_CHANNELS + 1]  # Column 0 holds the time stamp
        if label == 'locomotion':
            data_y = data[:, NB_SENSOR_CHANNELS + 1]  # Locomotion label
        elif label == 'gestures':
            data_y = data[:, NB_SENSOR_CHANNELS + 2]  # Gestures label
    elif label == -1:
        data_x = data[:, 1:-1]
        data_y = data[:, -1]
    else:
        raise RuntimeError("Invalid label: '%s'" % label)
    return data_x, data_y
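
# For the OPPORTUNITY case the column layout after select_columns_opp() is:
# column 0 holds the time stamp, columns 1..113 the sensor channels, column
# 114 the locomotion label and column 115 the gesture label, e.g.
#
#   data_x, data_y = divide_x_y(data, 'gestures')   # data_y == data[:, 115]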
def adjust_idx_labels(data_y, label):
    """Transforms original labels into the range [0, nb_labels-1]

    :param data_y: numpy integer array
        Sensor labels
    :param label: string, ['gestures' (default), 'locomotion']
        Type of activities to be recognized
    :return: numpy integer array
        Modified sensor labels
    """
    if label == 'locomotion':  # Labels for locomotion are adjusted
        data_y[data_y == 4] = 3
        data_y[data_y == 5] = 4
    elif label == 'gestures':  # Labels for gestures are adjusted
        data_y[data_y == 406516] = 1
        data_y[data_y == 406517] = 2
        data_y[data_y == 404516] = 3
        data_y[data_y == 404517] = 4
        data_y[data_y == 406520] = 5
        data_y[data_y == 404520] = 6
        data_y[data_y == 406505] = 7
        data_y[data_y == 404505] = 8
        data_y[data_y == 406519] = 9
        data_y[data_y == 404519] = 10
        data_y[data_y == 406511] = 11
        data_y[data_y == 404511] = 12
        data_y[data_y == 406508] = 13
        data_y[data_y == 404508] = 14
        data_y[data_y == 408512] = 15
        data_y[data_y == 407521] = 16
        data_y[data_y == 405506] = 17
    return data_y
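
# Quick illustration of the remapping (values taken from the table above):
#
#   >>> adjust_idx_labels(np.array([0, 406516, 405506]), 'gestures')
#   array([ 0,  1, 17])
#
# The null class 0 is left untouched, so gesture labels end up in [0, 17].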
def check_data(data_set):
    """Tries to access the file and checks whether the dataset is in the data directory

    In case the file is not found, tries to download it from its original location.

    :param data_set:
        Path with original OPPORTUNITY zip file
    :return:
    """
    print('Checking dataset {0}'.format(data_set))
    data_dir, data_file = os.path.split(data_set)
    # When a directory is not provided, check if the dataset is in the data directory
    if data_dir == "" and not os.path.isfile(data_set):
        new_path = os.path.join(os.path.split(__file__)[0], "data", data_set)
        if os.path.isfile(new_path) or data_file == 'OpportunityUCIDataset.zip':
            data_set = new_path
    # When the dataset is not found, try to download it from the UCI repository
    if (not os.path.isfile(data_set)) and data_file == 'OpportunityUCIDataset.zip':
        print('... dataset path {0} not found'.format(data_set))
        import urllib.request  # 'import urllib' alone does not expose urllib.request in Python 3
        origin = (
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00226/OpportunityUCIDataset.zip'
        )
        if not os.path.exists(data_dir):
            print('... creating directory {0}'.format(data_dir))
            os.makedirs(data_dir)
        print('... downloading data from {0}'.format(origin))
        urllib.request.urlretrieve(origin, data_set)
    return data_dir
def process_dataset_file(dataset_name, data, label):
    """Pipeline to process an individual dataset file

    :param dataset_name: string, ['Opportunity', 'Daphnet']
        Name of the dataset being processed
    :param data: numpy integer matrix
        Matrix containing data samples (rows) for every sensor channel (column)
    :param label: string, ['gestures' (default), 'locomotion']
        Type of activities to be recognized
    :return: numpy integer matrix, numpy integer array
        Processed sensor data, segmented into features (x) and labels (y)
    """
    if dataset_name == 'Opportunity':
        # Select correct columns
        data = select_columns_opp(data)
        # Columns are segmented into features and labels
        data_x, data_y = divide_x_y(data, label)
        data_y = adjust_idx_labels(data_y, label)
        data_y = data_y.astype(int)
        # Fill gaps in each channel by linear interpolation
        data_x = np.array([Series(i).interpolate() for i in data_x.T]).T
        # Remaining missing data are converted to zero
        data_x[np.isnan(data_x)] = 0
        # All sensor channels are normalized
        data_x = normalize(data_x, NORM_MAX_THRESHOLDS, NORM_MIN_THRESHOLDS)
    if dataset_name == 'Daphnet':
        data_x, data_y = divide_x_y(data, -1)
        # Fill gaps in each channel by linear interpolation
        data_x = np.array([Series(i).interpolate() for i in data_x.T]).T
        # Remaining missing data are converted to zero
        data_x[np.isnan(data_x)] = 0
        # Remap the original classes {0, 1, 2} to binary {not freeze, freeze}:
        # classes 0 and 1 collapse to 0, class 2 becomes 1.
        data_y = data_y.astype(int)
        reindex = [0, 0, 1]
        data_y = np.array([reindex[y] for y in data_y])
    return data_x, data_y
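
# Illustrative shapes for an Opportunity run with T samples (assuming the
# standard 250-column raw layout described above):
#
#   x, y = process_dataset_file('Opportunity', data, 'gestures')
#   # x: (T, 113) float array, gap-filled and scaled to [0, 1]
#   # y: (T,) int array of gesture class indices in [0, 17]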
def generate_data(dataset_name, dataset, test_sub, label):
    """Reads the raw dataset zip file, processes all sensor channels and
    writes one pickled (x, y) pair per run to the data/ directory

    :param dataset_name: string, ['Opportunity', 'Daphnet']
        Name of the dataset being processed
    :param dataset: string
        Path with the original dataset zip file
    :param test_sub: string
        Subject held out for testing (or 'challenge' for the fixed OPPORTUNITY split)
    :param label: string, ['gestures' (default), 'locomotion']
        Type of activities to be recognized. The OPPORTUNITY dataset includes several annotations to perform
        recognition of modes of locomotion/postures and recognition of sporadic gestures.
    """
    data_dir = check_data(dataset)
    train_files, test_files, val_files = select_subject(dataset_name, test_sub)
    zf = zipfile.ZipFile(dataset)
    print('Processing dataset files ...')
    try:
        os.mkdir('data')
    except FileExistsError:  # Remove previously generated files if the directory already exists.
        for file in os.scandir('data'):
            if 'data' in file.name:
                os.remove(file.path)
    # Generate training files
    print('Generating training files')
    for i, filename in enumerate(train_files):
        try:
            data = np.loadtxt(BytesIO(zf.read(filename)))
            print('... file {} -> train_data_{}'.format(filename, i))
            x, y = process_dataset_file(dataset_name, data, label)
            with open('data/train_data_{}'.format(i), 'wb') as f:
                cp.dump((x, y), f)
        except KeyError:
            print('ERROR: Did not find {} in zip file'.format(filename))
    # Generate validation files
    print('Generating validation files')
    for i, filename in enumerate(val_files):
        try:
            data = np.loadtxt(BytesIO(zf.read(filename)))
            print('... file {} -> val_data_{}'.format(filename, i))
            x, y = process_dataset_file(dataset_name, data, label)
            with open('data/val_data_{}'.format(i), 'wb') as f:
                cp.dump((x, y), f)
        except KeyError:
            print('ERROR: Did not find {} in zip file'.format(filename))
    # Generate testing files
    print('Generating testing files')
    for i, filename in enumerate(test_files):
        try:
            data = np.loadtxt(BytesIO(zf.read(filename)))
            print('... file {} -> test_data_{}'.format(filename, i))
            x, y = process_dataset_file(dataset_name, data, label)
            with open('data/test_data_{}'.format(i), 'wb') as f:
                cp.dump((x, y), f)
        except KeyError:
            print('ERROR: Did not find {} in zip file'.format(filename))
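
# The pickles written above each hold a single (x, y) tuple, so a consumer
# can read them back with the same _pickle module, e.g.:
#
#   with open('data/train_data_0', 'rb') as f:
#       x, y = cp.load(f)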
def find_data(name):
    """Maps a dataset name to the expected path of its raw zip file"""
    dataset_dir = 'data/raw/'
    dataset_names = {'Opportunity': 'OpportunityUCIDataset.zip', 'Daphnet': 'dataset_fog_release.zip', 'PAMAP2': 'PAMAP2_Dataset.zip'}
    dataset = dataset_dir + dataset_names[name]
    return dataset
def get_args():
    """Parses and returns the arguments passed to the script"""
    parser = argparse.ArgumentParser(
        description='Preprocess OPPORTUNITY dataset')
    # Add arguments
    # parser.add_argument(
    #     '-i', '--input', type=str, help='OPPORTUNITY zip file', required=True)
    parser.add_argument(
        '-s', '--subject', type=str, help='Subject to leave out for testing', required=True)
    parser.add_argument(
        '-t', '--task', type=str.lower, help='Type of activities to be recognized (for Opportunity)', default="gestures", choices=["gestures", "locomotion"], required=False)
    parser.add_argument(
        '-d', '--dataset', type=str, help='Name of dataset.', default="Opportunity", choices=["Daphnet", "Opportunity"], required=False)
    # Parse all arguments passed to the script
    args = parser.parse_args()
    # Assign args to variables
    dataset = args.dataset
    subject = args.subject
    label = args.task
    # Return all variable values
    return dataset, subject, label
# Entry point: parse arguments, locate the raw zip file and generate the
# processed train/validation/test files.
if __name__ == '__main__':
    dataset_name, subject, label = get_args()
    generate_data(dataset_name, find_data(dataset_name), subject, label)
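
# Example invocation (assumes the raw zip has been placed under data/raw/ as
# expected by find_data, or that check_data can download it):
#
#   python preprocess_data.py --subject 2 --task gestures --dataset Opportunity
#
# which writes data/train_data_*, data/val_data_* and data/test_data_*.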