-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils.py
87 lines (68 loc) · 2.71 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
def print_confusion_matrix(true, pred):
    """Print a labelled confusion matrix for an inlier/outlier classification.

    Parameters
    ----------
    true : array-like
        Ground-truth labels (first argument to ``confusion_matrix``).
    pred : array-like
        Predicted labels (second argument to ``confusion_matrix``).

    Notes
    -----
    ``sklearn.metrics.confusion_matrix`` puts the *true* classes on rows and
    the *predicted* classes on columns. The printed table uses the opposite
    layout (true classes as columns, predictions as rows), so the matrix is
    transposed before labelling; otherwise the off-diagonal counts would be
    shown under the wrong headings.
    """
    cm = confusion_matrix(true, pred)
    # NOTE(review): the hard-coded labels assume exactly two classes and that
    # the inlier label sorts before the outlier label (e.g. 0/1). With a
    # +1/-1 encoding the outlier class would come first -- confirm with callers.
    df = pd.DataFrame(
        cm.T,
        columns=['Inlier (True)', 'Outlier (True)'],
        index=['Inlier (Pred)', 'Outlier (Pred)'],
    )
    print(df)
def plot_dbscan_results(dbscan_obj, df):
    """Plot DBSCAN cluster assignments for a table of price series.

    Creates a two-panel figure:

    * top: every row of ``df`` drawn as a line coloured by its cluster label
      (outliers, label ``-1``, as dotted grey lines);
    * bottom: the rows that are core samples, a shaded min/max band for each
      cluster, and the same dotted grey outlier lines.

    Parameters
    ----------
    dbscan_obj
        Object exposing ``labels_`` and ``core_sample_indices_``
        (presumably a fitted ``sklearn.cluster.DBSCAN`` -- confirm).
    df : pandas.DataFrame
        One series per row, one x position per column. Rows must be in the
        same order as ``dbscan_obj.labels_``. Per the original comments the
        rows are regions and the columns are weeks.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    # Coerce to an array so ``labels == lab`` below is always an
    # element-wise mask, even if ``labels_`` is a plain sequence.
    labels = np.asarray(dbscan_obj.labels_)
    core_sample_idx = dbscan_obj.core_sample_indices_

    _, (top, bot) = plt.subplots(2, 1, figsize=(15, 14))

    # One colour per label, with outliers forced to grey.
    unique_labels = np.unique(labels)
    palette = sns.husl_palette(len(unique_labels), h=.5)
    colors = dict(zip(unique_labels, palette))
    colors[-1] = 'grey'

    # x values are the column positions (weeks).
    x = list(range(df.shape[1]))

    # Rows are accessed by position rather than by index label, so a
    # non-unique index cannot turn a single row into a 2-D selection.
    rows = df.values

    # Top: all series; outliers are dotted on both panels.
    for lab, price in zip(labels, rows):
        if lab == -1:
            for ax in (top, bot):
                ax.plot(x, price, color=colors[lab], ls=':', alpha=0.3,
                        zorder=1)
        else:
            top.plot(x, price, color=colors[lab], ls='-', alpha=0.5,
                     zorder=2)

    # Bottom: core samples, taken from the rows of ``df``.
    # ``dbscan_obj.components_`` is deliberately not used: the original
    # author noted it may not hold the raw data.
    for idx in core_sample_idx:
        bot.plot(x, rows[idx], color=colors[labels[idx]])

    # Bottom: per-cluster envelope (column-wise min/max); outliers skipped.
    for lab in unique_labels:
        if lab == -1:
            continue
        members = df.loc[labels == lab]
        bot.fill_between(x, members.min(), members.max(),
                         color=colors[lab], alpha=0.3)

    top.set_title('DBSCAN Results for Avocado Prices')
    bot.set_title('DBSCAN Components for Avocado Prices')
    bot.set_xlabel('Week')
    top.set_ylabel('Average Price')
    bot.set_ylabel('Average Price')