-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathShapiroWilksTest.m
287 lines (231 loc) · 9.53 KB
/
ShapiroWilksTest.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
function [H, pValue, W] = ShapiroWilksTest(x, alpha, tail)
%SWTEST Shapiro-Wilk parametric hypothesis test of composite normality.
% [H, pValue, SWstatistic] = SWTEST(X, ALPHA, TAIL) performs
% the Shapiro-Wilk test to determine if the null hypothesis of
% composite normality is a reasonable assumption regarding the
% population distribution of a random sample X. The desired significance
% level, ALPHA, is an optional scalar input (default = 0.05).
% TAIL indicates the type of test (default = 1).
%
% The Shapiro-Wilk hypotheses are:
% Null Hypothesis: X is normal with unspecified mean and variance.
% For TAIL = 0 (2-sided test), alternative: X is not normal.
% For TAIL = 1 (1-sided test), alternative: X is upper the normal.
% For TAIL = -1 (1-sided test), alternative: X is lower the normal.
%
% This is an omnibus test, and is generally considered relatively
% powerful against a variety of alternatives.
% Shapiro-Wilk test is better than the Shapiro-Francia test for
% Platykurtic sample. Conversely, Shapiro-Francia test is better than the
% Shapiro-Wilk test for Leptokurtic samples.
%
% When the series 'X' is Leptokurtic, SWTEST performs the Shapiro-Francia
% test, else (series 'X' is Platykurtic) SWTEST performs the
% Shapiro-Wilk test.
%
% [H, pValue, SWstatistic] = SWTEST(X, ALPHA, TAIL)
%
% Inputs:
% X - a vector of deviates from an unknown distribution. The observation
% number must exceed 3 and less than 5000.
%
% Optional inputs:
% ALPHA - The significance level for the test (default = 0.05).
%
% TAIL - The type of the test (default = 1).
%
% Outputs:
% SWstatistic - The test statistic (non normalized).
%
% pValue - is the p-value, or the probability of observing the given
% result by chance given that the null hypothesis is true. Small values
% of pValue cast doubt on the validity of the null hypothesis.
%
% H = 0 => Do not reject the null hypothesis at significance level ALPHA.
% H = 1 => Reject the null hypothesis at significance level ALPHA.
%
%
% References: Royston P. "Algorithm AS R94", Applied Statistics (1995) Vol. 44, No. 4.
% AS R94 -- calculates Shapiro-Wilk normality test and P-value
% for sample sizes 3 <= n <= 5000. Handles censored or uncensored data.
% Corrects AS 181, which was found to be inaccurate for n > 50.
%
%
% Ensure the sample data is a VECTOR.
%
if numel(x) == length(x)
x = x(:); % Ensure a column vector.
else
error(' Input sample ''X'' must be a vector.');
end
%
% Remove missing observations indicated by NaN's and check sample size.
%
x = x(~isnan(x));
if length(x) < 3
error(' Sample vector ''X'' must have at least 3 valid observations.');
end
if length(x) > 5000
warning('Shapiro-Wilk test might be inaccurate due to large sample size ( > 5000).');
end
%
% Ensure the significance level, ALPHA, is a
% scalar, and set default if necessary.
%
if (nargin >= 2) && ~isempty(alpha)
if numel(alpha) > 1
error(' Significance level ''Alpha'' must be a scalar.');
end
if (alpha <= 0 || alpha >= 1)
error(' Significance level ''Alpha'' must be between 0 and 1.');
end
else
alpha = 0.05;
end
%
% Ensure the type-of-test indicator, TAIL, is a scalar integer from
% the allowable set {-1 , 0 , 1}, and set default if necessary.
%
if (nargin >= 3) && ~isempty(tail)
if numel(tail) > 1
error('Type-of-test indicator ''Tail'' must be a scalar.');
end
if (tail ~= -1) && (tail ~= 0) && (tail ~= 1)
error('Type-of-test indicator ''Tail'' must be -1, 0, or 1.');
end
else
tail = 1;
end
% First, calculate the a's for weights as a function of the m's
% See Royston (1995) for details in the approximation.
x = sort(x); % Sort the vector X in ascending order.
n = length(x);
mtilde = norminv(((1:n)' - 3/8) / (n + 0.25));
weights = zeros(n,1); % Preallocate the weights.
if kurtosis(x) > 3
% The Shapiro-Francia test is better for leptokurtic samples.
weights = 1/sqrt(mtilde'*mtilde) * mtilde;
%
% The Shapiro-Francia statistic W is calculated to avoid excessive rounding
% errors for W close to 1 (a potential problem in very large samples).
%
W = (weights' * x) ^2 / ((x - mean(x))' * (x - mean(x)));
nu = log(n);
u1 = log(nu) - nu;
u2 = log(nu) + 2/nu;
mu = -1.2725 + (1.0521 * u1);
sigma = 1.0308 - (0.26758 * u2);
newSFstatistic = log(1 - W);
%
% Compute the normalized Shapiro-Francia statistic and its p-value.
%
NormalSFstatistic = (newSFstatistic - mu) / sigma;
% the next p-value is for the tail = 1 test.
pValue = 1 - normcdf(NormalSFstatistic, 0, 1);
else
% The Shapiro-Wilk test is better for platykurtic samples.
c = 1/sqrt(mtilde'*mtilde) * mtilde;
u = 1/sqrt(n);
PolyCoef_1 = [-2.706056 , 4.434685 , -2.071190 , -0.147981 , 0.221157 , c(n)];
PolyCoef_2 = [-3.582633 , 5.682633 , -1.752461 , -0.293762 , 0.042981 , c(n-1)];
PolyCoef_3 = [-0.0006714 , 0.0250540 , -0.39978 , 0.54400];
PolyCoef_4 = [-0.0020322 , 0.0627670 , -0.77857 , 1.38220];
PolyCoef_5 = [0.00389150 , -0.083751 , -0.31082 , -1.5861];
PolyCoef_6 = [0.00303020 , -0.082676 , -0.48030];
PolyCoef_7 = [0.459 , -2.273];
weights(n) = polyval(PolyCoef_1 , u);
weights(1) = -weights(n);
% Special attention when n=3 (this is a special case).
if n == 3
weights(1) = 0.707106781;
weights(n) = -weights(1);
end
if n >= 6
weights(n-1) = polyval(PolyCoef_2 , u);
weights(2) = -weights(n-1);
count = 3;
phi = (mtilde'*mtilde - 2 * mtilde(n)^2 - 2 * mtilde(n-1)^2) / ...
(1 - 2 * weights(n)^2 - 2 * weights(n-1)^2);
else
count = 2;
phi = (mtilde'*mtilde - 2 * mtilde(n)^2) / ...
(1 - 2 * weights(n)^2);
end
%
% The vector 'WEIGHTS' obtained next corresponds to the same coefficients
% listed by Shapiro-Wilk in their original test for small samples.
%
weights(count : n-count+1) = mtilde(count : n-count+1) / sqrt(phi);
%
% The Shapiro-Wilk statistic W is calculated to avoid excessive rounding
% errors for W close to 1 (a potential problem in very large samples).
%
W = (weights' * x) ^2 / ((x - mean(x))' * (x - mean(x)));
%
% Calculate the significance level for W (exact for n=3).
%
newn = log(n);
if (n > 3) && (n <= 11)
mu = polyval(PolyCoef_3 , n);
sigma = exp(polyval(PolyCoef_4 , n));
gam = polyval(PolyCoef_7 , n);
newSWstatistic = -log(gam-log(1-W));
elseif n >= 12
mu = polyval(PolyCoef_5 , newn);
sigma = exp(polyval(PolyCoef_6 , newn));
newSWstatistic = log(1 - W);
elseif n == 3
mu = 0;
sigma = 1;
newSWstatistic = 0;
end
%
% Compute the normalized Shapiro-Wilk statistic and its p-value.
%
NormalSWstatistic = (newSWstatistic - mu) / sigma;
% The next p-value is for the tail = 1 test.
pValue = 1 - normcdf(NormalSWstatistic, 0, 1);
% Special attention when n=3 (this is a special case).
if n == 3
pValue = 1.909859 * (asin(sqrt(W)) - 1.047198);
NormalSWstatistic = norminv(pValue, 0, 1);
end
end
% The p-value just found is for the tail = 1 test.
if tail == 0
pValue = 2 * min(pValue, 1-pValue);
elseif tail == -1
pValue = 1 - pValue;
end
%
% To maintain consistency with existing Statistics Toolbox hypothesis
% tests, returning 'H = 0' implies that we 'Do not reject the null
% hypothesis at the significance level of alpha' and 'H = 1' implies
% that we 'Reject the null hypothesis at significance level of alpha.'
%
H = (alpha >= pValue);
%
% Copyright (c) 2009, Ahmed Ben Saïda
% All rights reserved.
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are
% met:
%
% * Redistributions of source code must retain the above copyright
% notice, this list of conditions and the following disclaimer.
% * Redistributions in binary form must reproduce the above copyright
% notice, this list of conditions and the following disclaimer in
% the documentation and/or other materials provided with the distribution
%
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
% POSSIBILITY OF SUCH DAMAGE.