-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutility.py
155 lines (127 loc) · 5.72 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import numpy as np
import pandas as pd
import collections
import pydotplus
from sklearn import tree
def important_nodes_generator(classifier, dataframe_x, list_y):
    """Return the leaf ids reached by the samples whose label is non-zero.

    :param classifier: fitted decision tree (anything exposing ``apply``)
    :param dataframe_x: feature rows passed straight to ``classifier.apply``
    :param list_y: per-row labels; rows with a non-zero label are "important"
    :return: list of leaf node ids, one per important row, duplicates kept,
        in row order
    """
    applied = classifier.apply(dataframe_x)
    # zip pairs each reached leaf with its label; truncating on the shorter
    # sequence also avoids an IndexError if list_y is longer than applied.
    return [node for node, label in zip(applied, list_y) if label != 0]
def y_creator(dataframe_x, dataframe_y):
    """
    :param dataframe_x: main dataframe
    :param dataframe_y: dataframe with tuples that are congruent with the result
    :return: y list, reordered dataframe_x
    Note: dataframe_x must have less column than dataframe_y
    """
    # Sort both frames on dataframe_x's columns so congruent tuples can be
    # matched with a single forward pass over each frame.
    sort_cols = dataframe_x.columns.tolist()
    dataframe_x = dataframe_x.sort_values(by=sort_cols)
    dataframe_y = dataframe_y.sort_values(by=sort_cols)
    n_cols = len(dataframe_x.columns)
    n_matches = dataframe_y.shape[0]
    y = []
    match_idx = 0  # next unconsumed row of dataframe_y
    for row in range(dataframe_x.shape[0]):
        if match_idx >= n_matches:
            # Every congruent tuple has been consumed; the rest are 0.
            y.append(0)
            continue
        row_matches = all(
            dataframe_x.iloc[row, c] == dataframe_y.iloc[match_idx, c]
            for c in range(n_cols)
        )
        y.append(1 if row_matches else 0)
        if row_matches:
            match_idx += 1
    return dataframe_x, dataframe_y, y
def transform_y_to_all_results(dataframe_x, dataframe_results):
    """
    :param dataframe_x: main data frame
    :param dataframe_results: data frame of results that are visible to the user
    :return: y with all the tuples that have the attributes of dataframe_result also with columns 'isfree' and 'tupleset'
    """
    dataframe_results = dataframe_results.drop_duplicates()
    result = pd.DataFrame()
    for row in range(dataframe_results.shape[0]):
        # Keep only the tuples of dataframe_x that agree with this result row
        # on every column visible to the user.
        new_rows = dataframe_x
        for column in range(len(dataframe_results.columns)):
            col_name = dataframe_results.columns.values[column]
            new_rows = new_rows[new_rows[col_name] == dataframe_results.iloc[row, column]]
        # Work on a copy: the chained filtering above yields a view of
        # dataframe_x, and adding columns to it would raise
        # SettingWithCopyWarning and risk mutating the caller's frame.
        new_rows = new_rows.copy()
        new_rows['tupleset'] = [row] * new_rows.shape[0]
        # A result row matched by exactly one tuple is forced ('isfree' = 0);
        # with several candidates any one of them could produce it.
        if new_rows.shape[0] == 1:
            new_rows['isfree'] = [0] * new_rows.shape[0]
        else:
            new_rows['isfree'] = [1] * new_rows.shape[0]
        result = pd.concat([result, new_rows], axis=0)
    print('\nThis is y:')
    print(result)
    return result
def tree_features_calculator(classifier, x, list_y):
    """
    The purity value indicates if the IEQ ('OR' of important nodes queries) is exact.
    If the value is 100%, then the IEQ is exact, i.e. it contains all an only the tuples
    of the input 'result'. Otherwise it contains also tuples that are not in the result,
    and the final IEQ must be considered approximated.
    N.B.1: +++ Function to be fixed!!! +++
    The impurity value is not correctly calculated if the value is below 100%.
    This because maybe some 'unwanted' tuples (left value of value=[1,1] in a leaf node)
    are in a wanted free set.
    N.B.2: +++ Enhancement +++
    When the function that prunes nodes discarding some tuples from the result will
    be implemented, it can be useful to indicate, beside the purity,
    also if there are extra or fewer tuples in the result.
    :param classifier:
    :param x:
    :param list_y:
    :return:
    """
    # Leaf ids reached by the non-zero-labelled rows; de-duplicated for the
    # membership tests below.
    important_nodes = important_nodes_generator(classifier, x, list_y)
    important_nodes = set(important_nodes)
    left_count = 0
    right_count = 0
    features = list(x.columns.values)
    # Render the tree to DOT text so per-node class counts can be read back
    # from the node labels; node_ids=True embeds "node #<id>" in each label.
    dot_data = tree.export_graphviz(classifier,
                                    feature_names=features,
                                    out_file=None,
                                    filled=True,
                                    rounded=True,
                                    node_ids=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    # Map each source node to its (sorted) pair of child node ids.
    edges = collections.defaultdict(list)
    for edge in graph.get_edge_list():
        edges[edge.get_source()].append(int(edge.get_destination()))
    for edge in edges:
        edges[edge].sort()
        for i in range(2):
            dest = graph.get_node(str(edges[edge][i]))[0]
            # Parse the node id out of the label text ("node #<id>\n...").
            # NOTE(review): this string surgery assumes the exact label layout
            # produced by export_graphviz — confirm against the sklearn version
            # in use.
            if int(dest.get_label().split("#")[1].split("\\")[0]) in important_nodes:
                # Accumulate the two class counts from "value = [l, r]";
                # presumably l = unwanted and r = wanted tuples (see N.B.1).
                left_count += int(dest.get_label().split("[")[1].split(',')[0])
                right_count += int(dest.get_label().split(", ")[1].split(']')[0])
    _, important_nodes_heights = heights_of_important_nodes(classifier, list_y, x)
    max_height = max(important_nodes_heights)
    # Share of "wanted" tuples among all tuples covered by the important nodes.
    purity = 100 * right_count / (left_count + right_count)
    number_important_nodes = len(set(important_nodes))
    return purity, max_height, number_important_nodes
def heights_of_important_nodes(classifier, list_y, x):
    """Compute the depth of every tree node and of the important nodes.

    :param classifier: fitted sklearn decision tree (reads ``classifier.tree_``)
    :param list_y: per-row labels; non-zero rows mark the important leaves
    :param x: feature rows, forwarded to ``classifier.apply``
    :return: ``(node_depth, altitudes)`` where ``node_depth[i]`` is the depth
        of node ``i`` (root = 0) and ``altitudes`` lists the depth of each
        important node, duplicates preserved, in the order produced by
        ``important_nodes_generator``
    """
    n_nodes = classifier.tree_.node_count
    children_left = classifier.tree_.children_left
    children_right = classifier.tree_.children_right
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    # Iterative DFS: each stack entry is (node id, depth of its parent).
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while stack:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        # In sklearn's tree arrays, a node is internal iff its left and right
        # child ids differ (leaves have both set to the same sentinel).
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True
    important_nodes = important_nodes_generator(classifier, x, list_y)
    # (Fixed: the original initialized `index = 0` twice and tracked the
    # output position by hand; a comprehension does the same mapping.)
    altitude_of_important_nodes = [node_depth[node] for node in important_nodes]
    return node_depth, altitude_of_important_nodes