Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Statsmodels api #23

Merged
merged 7 commits into from
Oct 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
.Rhistory
.RData
.Ruserdata
.Rproj
.Rproj
__pycache__/
*.egg-info/
51 changes: 41 additions & 10 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,43 +12,74 @@ If you'd like to cooperate, either send us an

## Example

Note: everything is still very much work in progress, and there are multiple errors in the code that I am aware of. Still, I believe that the implementation of the WCR11 is more or less correct.
Note: everything is still very much work in progress. Still, I believe that the implementation of the WCR11 and WCU 11 is more or less correct.

```
import wildboottest.wildboottest as wb
from wildboottest.wildboottest import wildboottest, Wildboottest
import statsmodels.api as sm
import numpy as np
import timeit
import time

N = 100000
k = 100
G= 50
N = 1000
k = 10
G= 12
X = np.random.normal(0, 1, N * k).reshape((N,k))
beta = np.random.normal(0,1,k)
beta[0] = 0.005
u = np.random.normal(0,1,N)
Y = 1 + X @ beta + u
cluster = np.random.choice(list(range(0,G)), N)
B = 99999


model = sm.OLS(Y, X)
model.exog
results = model.fit(cov_type = 'cluster', cov_kwds = {
'groups': cluster
})
results.summary()
# >>> results.summary()
# <class 'statsmodels.iolib.summary.Summary'>
# """
# OLS Regression Results
# =======================================================================================
# Dep. Variable: y R-squared (uncentered): 0.799
# Model: OLS Adj. R-squared (uncentered): 0.797
# Method: Least Squares F-statistic: 790.3
# Date: Sat, 15 Oct 2022 Prob (F-statistic): 3.16e-14
# Time: 12:03:43 Log-Likelihood: -1784.6
# No. Observations: 1000 AIC: 3589.
# Df Residuals: 990 BIC: 3638.
# Df Model: 10
# Covariance Type: cluster
# ==============================================================================
# coef std err z P>|z| [0.025 0.975]
# ------------------------------------------------------------------------------
# x1 0.0128 0.064 0.200 0.841 -0.113 0.138
wildboottest(model, "X1", cluster, B)
# 0.8408408408408409


# execute, method by method

# some preparations
bootcluster = cluster
R = np.zeros(k)
R[0] = 1
B = 99999

start_time = timeit.default_timer()
wcr = wb.Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341)
wcr.get_scores(bootstrap_type = "11", impose_null = True)
wcr.get_weights(weights_type = "rademacher")
wcr.get_numer()
wcr.get_denom()
wcr.numer
wcr.denom
wcr.get_tboot()
wcr.t_boot
wcr.get_vcov()
wcr.get_tstat()
wcr.get_pvalue(pval_type = "two-tailed")
print("estimation time:", timeit.default_timer() - start_time)
# >>> 0.9225496 seconds
print("p value:", wcr.pvalue)
# >>> p value: 0.15258152581525816
# >>> p value: 0.8408408408408409
```
56 changes: 25 additions & 31 deletions tests/bootalgo-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def WCR11_not_WCU11():
G= 20
X = np.random.normal(0, 1, N * k).reshape((N,k))
beta = np.random.normal(0,1,k)
beta[0] = 1
beta[0] = 0.1
u = np.random.normal(0,1,N)
y = 1 + X @ beta + u
cluster = np.random.choice(list(range(0,G)), N)
Expand All @@ -17,37 +17,31 @@ def WCR11_not_WCU11():
R[0] = 1
B = 999

wcr = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
wcr.get_scores(bootstrap_type = "11", impose_null = True)
wcr.get_weights(weights_type = "rademacher")
wcr.get_numer()
wcr.get_denom()
wcr.numer
wcr.denom
wcr.get_tboot()
wcr.t_boot
wcr.get_vcov()
wcr.get_tstat()
wcr.get_pvalue(pval_type = "two-tailed")
boot = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
boot.get_scores(bootstrap_type = "11", impose_null = True)
boot.get_weights(weights_type = "rademacher")
boot.get_numer()
boot.get_denom()
boot.get_tboot()
boot.get_vcov()
boot.get_tstat()
boot.get_pvalue(pval_type = "two-tailed")

wcu = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
wcu.get_scores(bootstrap_type = "11", impose_null = True)
wcr.get_weights(weights_type = "rademacher")
wcu = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
wcu.get_scores(bootstrap_type = "11", impose_null = False)
wcu.get_weights(weights_type = "rademacher")
wcu.get_numer()
wcu.get_denom()
wcu.numer
wcu.denom
wcu.get_tboot()
wcu.t_boot
wcu.get_vcov()
wcu.get_tstat()
wcu-(pval_type = "two-tailed")
wcu.get_pvalue(pval_type = "two-tailed")

# score matrices of WCR11 and WCU11 should be different - currently not the case
assert not np.array_equal(wcr.scores_mat, wcu.scores_mat)
assert not np.array_equal(wcr.t_boot, wcu.t_boot)
assert np.array_equal(wcr.t_stat, wcu.t_stat)
assert not np.array_equal(wcr.pvalue, wcu.pvalue) # unless both pvals are zero or 1...
assert not np.array_equal(boot.scores_mat, wcu.scores_mat)
assert not np.array_equal(boot.t_boot, wcu.t_boot)
assert np.array_equal(boot.t_stat, wcu.t_stat)
assert not np.array_equal(boot.pvalue, wcu.pvalue) # unless both pvals are zero or 1...


def test_r_vs_py():
Expand Down Expand Up @@ -80,14 +74,14 @@ def full_enum_works():
B = 99999

wcr = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341)
wcr.get_scores(bootstrap_type = "11", impose_null = False)
wcr.get_weights(weights_type = "rademacher")
wcr.get_numer()
wcr.get_denom()
wcr.get_tboot()
boot.get_scores(bootstrap_type = "11", impose_null = False)
boot.get_weights(weights_type = "rademacher")
boot.get_numer()
boot.get_denom()
boot.get_tboot()

assert len(wcr.t_boot) == 2**G
assert wcr.full_enumeration == True
assert len(boot.t_boot) == 2**G
assert boot.full_enumeration == True



Expand Down
Binary file modified wildboottest/__pycache__/wildboottest.cpython-36.pyc
Binary file not shown.
139 changes: 135 additions & 4 deletions wildboottest/wildboottest.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None):
self.N_G_bootcluster = len(bootclustid)
self.G = len(clustid)

k = R.shape[0]
self.k = k
self.k = R.shape[0]
self.B = B
self.X = X
self.R = R
Expand All @@ -61,8 +60,8 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None):
y_list = []
tXgXg_list = []
tXgyg_list = []
tXX = np.zeros((k, k))
tXy = np.zeros(k)
tXX = np.zeros((self.k, self.k))
tXy = np.zeros(self.k)

#all_cluster = np.unique(bootcluster)

Expand Down Expand Up @@ -355,4 +354,136 @@ def draw_weights(t : str, full_enumeration: bool, N_G_bootcluster: int, boot_ite
return [v, boot_iter]


def wildboottest(model, cluster, B, param = None, weights_type = 'rademacher',impose_null = True, bootstrap_type = '11', seed = None):

'''
Run a wild cluster bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS'

Args:
model(statsmodels.regression.linear_model.OLS'): A statsmodels regression object
param(str): A string of length one, containing the test parameter of interest
cluster(np.array): A numpy array of dimension one, containing the clustering variable.
B(int): The number of bootstrap iterations to run
weights_type(str): The type of bootstrap weights. Either 'rademacher', 'mammen', 'webb' or 'normal'.
'rademacher' by default.
impose_null(logical): Should the null hypothesis be imposed on the bootstrap dgp, or not?
True by default.
bootstrap_type(str). A string of length one. Allows to choose the bootstrap type
to be run. Either '11', '31', '13' or '33'. '11' by default.
seed(int). Option to provide a random seed.

Returns:
A wild cluster bootstrapped p-value.

Example:

import statsmodels.api as sm
import numpy as np

np.random.seed(12312312)
N = 1000
k = 10
G= 10
X = np.random.normal(0, 1, N * k).reshape((N,k))
beta = np.random.normal(0,1,k)
beta[0] = 0.005
u = np.random.normal(0,1,N)
Y = 1 + X @ beta + u
cluster = np.random.choice(list(range(0,G)), N)

model = sm.OLS(Y, X)

boottest(model, param = "X1", cluster = cluster, B = 9999)

'''

# does model.exog already exclude missing values?
X = model.exog
# interestingly, the dependent variable is called 'endogeneous'
Y = model.endog
# weights not yet used, only as a placeholder
weights = model.weights

xnames = model.data.xnames
ynames = model.data.ynames

pvalues = []
tstats = []

def generate_stats(param):

R = np.zeros(len(xnames))
R[xnames.index(param)] = 1
# Just test for beta=0

# is it possible to fetch the clustering variables from the pre-processed data
# frame, e.g. with 'excluding' observations with missings etc
# cluster = ...

# set bootcluster == cluster for one-way clustering
bootcluster = cluster

boot = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster,
R = R, B = B, seed = seed)
boot.get_scores(bootstrap_type = bootstrap_type, impose_null = impose_null)
boot.get_weights(weights_type = weights_type)
boot.get_numer()
boot.get_denom()
boot.get_tboot()
boot.get_vcov()
boot.get_tstat()
boot.get_pvalue(pval_type = "two-tailed")

pvalues.append(boot.pvalue)
tstats.append(boot.t_stat)

return pvalues, tstats

if param is None:
for x in xnames:
pvalues, tstats = generate_stats(x)
elif isinstance(param, str):
pvalues, tstats = generate_stats(param)
else:
raise Exception("`param` not correctly specified")

return np.array(pvalues), np.array(tstats).flatten()

if __name__ == '__main__':
import statsmodels.api as sm
import numpy as np

np.random.seed(12312312)
N = 1000
k = 10
G= 10
X = np.random.normal(0, 1, N * k).reshape((N,k))
beta = np.random.normal(0,1,k)
beta[0] = 0.005
u = np.random.normal(0,1,N)
Y = 1 + X @ beta + u
cluster = np.random.choice(list(range(0,G)), N)

X_df = pd.DataFrame(data=X, columns = [f"col_{i}" for i in range(k)])

Y_df = pd.DataFrame(data=Y, columns = ['outcome'])

model = sm.OLS(Y_df, X_df)

print("--- WCB ---")
print(model.fit().summary())

print("--- NonRobust ---")
print(model.fit(cov_type='wildclusterbootstrap',
cov_kwds = {'cluster' : cluster,
'B' : 9999,
'weights_type' : 'rademacher',
'impose_null' : True,
'bootstrap_type' : '11',
'seed' : None}).summary())




# boottest(model, param = "X1", cluster = cluster, B = 9999)