-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathassistant_cv.py
140 lines (110 loc) · 4.58 KB
/
assistant_cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from __future__ import print_function, division

import argparse
import os.path
import time
from itertools import groupby, repeat
from multiprocessing.pool import ThreadPool
from operator import itemgetter
from threading import Thread

import numpy as np
import unicodecsv as csv
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from watson_developer_cloud import AssistantV1
from watson_developer_cloud.assistant_v1 import CreateIntent, CreateExample
def make_folds(data, k, monte_carlo=True):
    """
    Creates an iterator over k stratified folds from the input data.

    Args:
        data (ndarray or list): (example, label) pairs, either as a 2d
            numpy array or as any sequence numpy can convert into one.
        k (int): The number of folds.
        monte_carlo (boolean): A flag to shuffle the data before creating folds.

    Returns:
        iterator: An iterator yielding one (train_indices, test_indices)
            pair per fold.
    """
    # the column slicing below needs an ndarray; a plain list of pairs would
    # otherwise fail with a TypeError on the tuple index
    data = np.asarray(data)
    skf = StratifiedKFold(n_splits=k, shuffle=monte_carlo)
    # stratify on the label column (index 1)
    return skf.split(data[:, 0], data[:, 1])
def load_data(data_path):
    """
    Loads numpy friendly data from a workspace export.

    Args:
        data_path (str): The path to the workspace exported data.

    Returns:
        ndarray: A numpy array with first column containing examples,
            and second column containing corresponding labels.

    Raises:
        TypeError: If data_path does not point to an existing file.
    """
    if not os.path.isfile(data_path):
        raise TypeError('{} is not a valid file'.format(data_path))

    # the file is opened in binary mode and the csv reader is handed the
    # encoding, so decoding happens inside the reader
    with open(data_path, mode='rb') as data_file:
        csv_reader = csv.reader(data_file, encoding='utf-8')
        # blank lines come back as empty rows; dropping them keeps the
        # result a rectangular 2d array that can be sliced by column
        rows = [row for row in csv_reader if row]

    return np.array(rows)
def validate_with_fold(fold, data, poll_interval=1.0):
    """
    Creates a workspace from the fold's training data, and
    computes classification metrics based on the fold's testing data.

    The temporary workspace is always deleted, even when training or
    messaging fails.

    Args:
        fold (tuple): A tuple of train and test indices.
        data (ndarray): A numpy 2d array of the intents and examples.
        poll_interval (float): Seconds to sleep between workspace status
            checks while waiting for training to finish.

    Returns:
        str: A classification report string from the created workspace.

    Raises:
        KeyError: If ASSISTANT_USERNAME or ASSISTANT_PASSWORD is not set
            in the environment.
        RuntimeError: If the workspace reports a 'Failed' status.
    """
    # create an assistant service instance with your credentials
    assistant = AssistantV1(username=os.environ['ASSISTANT_USERNAME'],
                            password=os.environ['ASSISTANT_PASSWORD'],
                            version='2017-05-26')

    train_indices, test_indices = fold
    train_data = data[train_indices]
    test_data = data[test_indices]

    # Collect every training example under its label. A plain dict is used
    # instead of itertools.groupby because groupby only merges *adjacent*
    # rows, so rows of one label that are not contiguous would overwrite
    # each other. Concrete lists are built (rather than a lazy map object)
    # so the examples are a real sequence on Python 3 as well.
    examples_by_label = {}
    for row in train_data:
        examples_by_label.setdefault(row[1], []).append(CreateExample(row[0]))

    create_intent_objs = [CreateIntent(label, examples=examples)
                          for label, examples in examples_by_label.items()]

    res = assistant.create_workspace(
        name='k-fold-test', intents=create_intent_objs)
    wid = res['workspace_id']

    try:
        # wait until training is done, sleeping between polls so the
        # service is not hammered with back-to-back requests
        while True:
            status = assistant.get_workspace(wid)['status']
            if status == 'Available':
                break
            if status == 'Failed':
                # training will never finish; bail out instead of spinning
                raise RuntimeError(
                    'training failed for workspace {}'.format(wid))
            time.sleep(poll_interval)

        # validate the workspace with the corresponding test data
        y_true, y_pred = [], []
        for example, label in test_data:
            res = assistant.message(workspace_id=wid,
                                    input={'text': example})
            # ignore counter examples
            if not res['intents']:
                continue
            y_true.append(label)
            y_pred.append(res['intents'][0]['intent'])
    finally:
        # delete the created workspace, whatever happened above
        assistant.delete_workspace(workspace_id=wid)

    # return the classification report string
    return classification_report(y_true, y_pred)
def validate_with_fold_helper(params):
    """
    Adapter that lets a single-argument mapper call validate_with_fold.

    Args:
        params (iterable): The positional arguments for validate_with_fold,
            packed into one object.

    Returns:
        The value returned by validate_with_fold.
    """
    report = validate_with_fold(*params)
    return report
def main():
    """
    Command line entry point: runs k-fold cross-validation over a workspace
    csv export and prints one classification report per fold.
    """
    # read the command line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--data', required=True,
                            help='path to the workspace csv export')
    arg_parser.add_argument('--folds', required=False, default=3,
                            type=int, help='number of folds for cross-validation')
    cli_args = arg_parser.parse_args()
    fold_count = cli_args.folds

    # load the data and split it into folds
    dataset = load_data(os.path.expanduser(cli_args.data))
    fold_indices = make_folds(dataset, fold_count)

    # one worker thread per fold, each given the fold plus the full data set
    workers = ThreadPool(processes=fold_count)
    jobs = [(fold, dataset) for fold in fold_indices]
    reports = workers.map(validate_with_fold_helper, jobs)
    workers.close()
    workers.join()

    # print the per-fold reports, numbered from 1
    header = '******************fold_{}_results********************\n'
    footer = '****************************************************\n'
    for index, report in enumerate(reports, 1):
        print(header.format(index))
        print(report)
        print(footer)


if __name__ == '__main__':
    main()