-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbw_select.py
194 lines (140 loc) · 4.55 KB
/
bw_select.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
'''
Functions ported from the R package sm.
Implements different bandwidth selection methods, including:
- Scott's rule of thumb
- Silverman's rule of thumb
- Sheather-Jones estimator
'''
# Shamelessly taken from here: https://github.com/Neojume/pythonABC/blob/master/hselect.py
# Good source of how to estimate your bw if you dont know the modality of underlying data: https://aakinshin.net/posts/kde-bw/
import numpy as np
__all__ = ['wmean',
'wvar',
'dnorm',
'hsilverman',
'hscott',
'hnorm',
'hsj']
class normal(object):
'''
The 1D normal (or Gaussian) distribution.
'''
@staticmethod
def pdf(x, mu, sigma):
return np.exp(normal.logpdf(x, mu, sigma))
@staticmethod
def logpdf(x, mu, sigma):
return -0.5 * np.log(2.0 * np.pi) - np.log(sigma) - \
0.5 * ((x - mu) ** 2) / (sigma ** 2)
@staticmethod
def rvs(mu, sigma, N=1):
return np.random.normal(mu, sigma, N)
def wmean(x, w):
'''
Weighted mean
'''
return sum(x * w) / float(sum(w))
def wvar(x, w):
'''
Weighted variance
'''
return sum(w * (x - wmean(x, w)) ** 2) / float(sum(w) - 1)
def dnorm(x):
return normal.pdf(x, 0.0, 1.0)
def bowman(x):
pass
# TODO: implement?
#hx = median(abs(x - median(x))) / 0.6745 * (4 / 3 / r.n) ^ 0.2
#hy = median(abs(y - median(y))) / 0.6745 * (4 / 3 / r.n) ^ 0.2
#h = sqrt(hy * hx)
def hsilverman(x, weights=None):
IQR = np.percentile(x, 75) - np.percentile(x, 25)
A = min(np.std(x, ddof=1), IQR / 1.349)
if weights is None:
weights = np.ones(len(x))
n = float(sum(weights))
return 0.9 * A * n ** (-0.2)
def hscott(x, weights=None):
IQR = np.percentile(x, 75) - np.percentile(x, 25)
A = min(np.std(x, ddof=1), IQR / 1.349)
if weights is None:
weights = np.ones(len(x))
n = float(sum(weights))
return 1.059 * A * n ** (-0.2)
def hnorm(x, weights=None):
'''
Bandwidth estimate assuming f is normal. See paragraph 2.4.2 of
Bowman and Azzalini[1]_ for details.
References
----------
.. [1] Applied Smoothing Techniques for Data Analysis: the
Kernel Approach with S-Plus Illustrations.
Bowman, A.W. and Azzalini, A. (1997).
Oxford University Press, Oxford
'''
x = np.asarray(x)
if weights is None:
weights = np.ones(len(x))
n = float(sum(weights))
if len(x.shape) == 1:
sd = np.sqrt(wvar(x, weights))
return sd * (4 / (3 * n)) ** (1 / 5.0)
# TODO: make this work for more dimensions
# ((4 / (p + 2) * n)^(1 / (p+4)) * sigma_i
if len(x.shape) == 2:
ndim = x.shape[1]
sd = np.sqrt(np.apply_along_axis(wvar, 1, x, weights))
return (4.0 / ((ndim + 2.0) * n) ** (1.0 / (ndim + 4.0))) * sd
def hsj(x, weights=None):
'''
Sheather-Jones bandwidth estimator [1]_.
References
----------
.. [1] A reliable data-based bandwidth selection method for kernel
density estimation. Simon J. Sheather and Michael C. Jones.
Journal of the Royal Statistical Society, Series B. 1991
'''
h0 = hnorm(x)
v0 = sj(x, h0)
if v0 > 0:
hstep = 1.1
else:
hstep = 0.9
h1 = h0 * hstep
v1 = sj(x, h1)
while v1 * v0 > 0:
h0 = h1
v0 = v1
h1 = h0 * hstep
v1 = sj(x, h1)
return h0 + (h1 - h0) * abs(v0) / (abs(v0) + abs(v1))
def sj(x, h):
'''
Equation 12 of Sheather and Jones [1]_
References
----------
.. [1] A reliable data-based bandwidth selection method for kernel
density estimation. Simon J. Sheather and Michael C. Jones.
Journal of the Royal Statistical Society, Series B. 1991
'''
phi6 = lambda x: (x ** 6 - 15 * x ** 4 + 45 * x ** 2 - 15) * dnorm(x)
phi4 = lambda x: (x ** 4 - 6 * x ** 2 + 3) * dnorm(x)
n = len(x)
one = np.ones((1, n))
lam = np.percentile(x, 75) - np.percentile(x, 25)
a = 0.92 * lam * n ** (-1 / 7.0)
b = 0.912 * lam * n ** (-1 / 9.0)
W = np.tile(x, (n, 1))
W = W - W.T
W1 = phi6(W / b)
tdb = np.dot(np.dot(one, W1), one.T)
tdb = -tdb / (n * (n - 1) * b ** 7)
W1 = phi4(W / a)
sda = np.dot(np.dot(one, W1), one.T)
sda = sda / (n * (n - 1) * a ** 5)
alpha2 = 1.357 * (abs(sda / tdb)) ** (1 / 7.0) * h ** (5 / 7.0)
W1 = phi4(W / alpha2)
sdalpha2 = np.dot(np.dot(one, W1), one.T)
sdalpha2 = sdalpha2 / (n * (n - 1) * alpha2 ** 5)
return (normal.pdf(0, 0, np.sqrt(2)) /
(n * abs(sdalpha2[0, 0]))) ** 0.2 - h