From 7676d250f4f5ef0d770ebde8048d3afe992a4fe6 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Fri, 14 Oct 2022 18:25:34 +0200 Subject: [PATCH 1/6] some fixes in tests --- tests/bootalgo-tests.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/bootalgo-tests.py b/tests/bootalgo-tests.py index 7ce479c..0bcb2d5 100644 --- a/tests/bootalgo-tests.py +++ b/tests/bootalgo-tests.py @@ -8,7 +8,7 @@ def WCR11_not_WCU11(): G= 20 X = np.random.normal(0, 1, N * k).reshape((N,k)) beta = np.random.normal(0,1,k) - beta[0] = 1 + beta[0] = 0.1 u = np.random.normal(0,1,N) y = 1 + X @ beta + u cluster = np.random.choice(list(range(0,G)), N) @@ -22,26 +22,20 @@ def WCR11_not_WCU11(): wcr.get_weights(weights_type = "rademacher") wcr.get_numer() wcr.get_denom() - wcr.numer - wcr.denom wcr.get_tboot() - wcr.t_boot wcr.get_vcov() wcr.get_tstat() wcr.get_pvalue(pval_type = "two-tailed") - wcu = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341) - wcu.get_scores(bootstrap_type = "11", impose_null = True) - wcr.get_weights(weights_type = "rademacher") + wcu = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341) + wcu.get_scores(bootstrap_type = "11", impose_null = False) + wcu.get_weights(weights_type = "rademacher") wcu.get_numer() wcu.get_denom() - wcu.numer - wcu.denom wcu.get_tboot() - wcu.t_boot wcu.get_vcov() wcu.get_tstat() - wcu-(pval_type = "two-tailed") + wcu.get_pvalue(pval_type = "two-tailed") # score matrices of WCR11 and WCU11 should be different - currently not the case assert not np.array_equal(wcr.scores_mat, wcu.scores_mat) From 146888a8f377a20dbc343f70fbaa2ae5384575d1 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Sat, 15 Oct 2022 12:07:09 +0200 Subject: [PATCH 2/6] add wildboottest function --- readme.md | 51 +++++++++--- tests/bootalgo-tests.py | 40 ++++----- .../__pycache__/wildboottest.cpython-36.pyc | Bin 8349 -> 10459 bytes wildboottest/wildboottest.py | 77 ++++++++++++++++++ 4 files changed, 138 insertions(+), 30 deletions(-) diff --git a/readme.md b/readme.md index 42daabb..5d6b31e 100644 --- a/readme.md +++ b/readme.md @@ -12,27 +12,61 @@ If you'd like to cooperate, either send us an ## Example -Note: everything is still very much work in progress, and there are multiple errors in the code that I am aware of. Still, I believe that the implementation of the WCR11 is more or less correct. +Note: everything is still very much work in progress. Still, I believe that the implementation of the WCR11 and WCU 11 is more or less correct. ``` -import wildboottest.wildboottest as wb +from wildboottest.wildboottest import wildboottest, Wildboottest +import statsmodels.api as sm import numpy as np import timeit import time -N = 100000 -k = 100 -G= 50 +N = 1000 +k = 10 +G= 12 X = np.random.normal(0, 1, N * k).reshape((N,k)) beta = np.random.normal(0,1,k) beta[0] = 0.005 u = np.random.normal(0,1,N) Y = 1 + X @ beta + u cluster = np.random.choice(list(range(0,G)), N) +B = 99999 + + +model = sm.OLS(Y, X) +model.exog +results = model.fit(cov_type = 'cluster', cov_kwds = { + 'groups': cluster +}) +results.summary() +# >>> results.summary() +# +# """ +# OLS Regression Results +# ======================================================================================= +# Dep. Variable: y R-squared (uncentered): 0.799 +# Model: OLS Adj. R-squared (uncentered): 0.797 +# Method: Least Squares F-statistic: 790.3 +# Date: Sat, 15 Oct 2022 Prob (F-statistic): 3.16e-14 +# Time: 12:03:43 Log-Likelihood: -1784.6 +# No. Observations: 1000 AIC: 3589. +# Df Residuals: 990 BIC: 3638. +# Df Model: 10 +# Covariance Type: cluster +# ============================================================================== +# coef std err z P>|z| [0.025 0.975] +# ------------------------------------------------------------------------------ +# x1 0.0128 0.064 0.200 0.841 -0.113 0.138 +wildboottest(model, "X1", cluster, B) +# 0.8408408408408409 + + +# execute, method by method + +# some preparations bootcluster = cluster R = np.zeros(k) R[0] = 1 -B = 99999 start_time = timeit.default_timer() wcr = wb.Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341) @@ -40,15 +74,12 @@ wcr.get_scores(bootstrap_type = "11", impose_null = True) wcr.get_weights(weights_type = "rademacher") wcr.get_numer() wcr.get_denom() -wcr.numer -wcr.denom wcr.get_tboot() -wcr.t_boot wcr.get_vcov() wcr.get_tstat() wcr.get_pvalue(pval_type = "two-tailed") print("estimation time:", timeit.default_timer() - start_time) # >>> 0.9225496 seconds print("p value:", wcr.pvalue) -# >>> p value: 0.15258152581525816 +# >>> p value: 0.8408408408408409 ``` diff --git a/tests/bootalgo-tests.py b/tests/bootalgo-tests.py index 0bcb2d5..e964fcb 100644 --- a/tests/bootalgo-tests.py +++ b/tests/bootalgo-tests.py @@ -17,15 +17,15 @@ def WCR11_not_WCU11(): R[0] = 1 B = 999 - wcr = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341) - wcr.get_scores(bootstrap_type = "11", impose_null = True) - wcr.get_weights(weights_type = "rademacher") - wcr.get_numer() - wcr.get_denom() - wcr.get_tboot() - wcr.get_vcov() - wcr.get_tstat() - wcr.get_pvalue(pval_type = "two-tailed") + boot = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341) + boot.get_scores(bootstrap_type = "11", impose_null = True) + boot.get_weights(weights_type = "rademacher") + boot.get_numer() + boot.get_denom() + boot.get_tboot() + boot.get_vcov() + boot.get_tstat() + boot.get_pvalue(pval_type = "two-tailed") wcu = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341) wcu.get_scores(bootstrap_type = "11", impose_null = False) @@ -38,10 +38,10 @@ def WCR11_not_WCU11(): wcu.get_pvalue(pval_type = "two-tailed") # score matrices of WCR11 and WCU11 should be different - currently not the case - assert not np.array_equal(wcr.scores_mat, wcu.scores_mat) - assert not np.array_equal(wcr.t_boot, wcu.t_boot) - assert np.array_equal(wcr.t_stat, wcu.t_stat) - assert not np.array_equal(wcr.pvalue, wcu.pvalue) # unless both pvals are zero or 1... + assert not np.array_equal(boot.scores_mat, wcu.scores_mat) + assert not np.array_equal(boot.t_boot, wcu.t_boot) + assert np.array_equal(boot.t_stat, wcu.t_stat) + assert not np.array_equal(boot.pvalue, wcu.pvalue) # unless both pvals are zero or 1... def test_r_vs_py(): @@ -74,14 +74,14 @@ def full_enum_works(): B = 99999 wcr = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341) - wcr.get_scores(bootstrap_type = "11", impose_null = False) - wcr.get_weights(weights_type = "rademacher") - wcr.get_numer() - wcr.get_denom() - wcr.get_tboot() + boot.get_scores(bootstrap_type = "11", impose_null = False) + boot.get_weights(weights_type = "rademacher") + boot.get_numer() + boot.get_denom() + boot.get_tboot() - assert len(wcr.t_boot) == 2**G - assert wcr.full_enumeration == True + assert len(boot.t_boot) == 2**G + assert boot.full_enumeration == True diff --git a/wildboottest/__pycache__/wildboottest.cpython-36.pyc b/wildboottest/__pycache__/wildboottest.cpython-36.pyc index 65e375553e864b4ba255b05e6d7808442764d2e9..e39e10036a478c4cdb434fefc73017c0e8e32066 100644 GIT binary patch delta 2812 zcmai0O>7%Q6!xrt#ldVv*Y}f zH5P^M9R~_K#0&A9(@U5r81!UJ+8Dx|l*fC95^tmIW=eb!q3Wij#W`= z24NbZzVqj#LGD{a`Vv5pa@LvI;X%A@ScQZ#5cpeD4bt>~n_8SZhl1x3UPoXE9HD`5 z#b2DhHYkvE5hU=_`__S+B9I~thzt?y{$JDA$%g;-OojCPZ)QGyVhe?@0tCbnZy<9E z3Mmz%7{guz*!C>9C>P&JAUNyCHqHk_|wmAE4ICBq8Z6jPmP`ye2%G`cU zZ2HdZ40+psZpJ{ zsqUa^)7>GNX${D%H@J=R!#eSXy}?#*5Y3(ZPnnRMH>&P}%@ctdd4HEgS7 z8Wv}wss>8up1Y8PS>V4U8tzFXC{;uGP`&s6`h}BpiSC(<_9u&+QdS)%SR1U0NM0CL z1HCo5)sRgHf)^=lQOOJoM`V*z$=#l$N|9sWAZ=Jca91jmQ9|;TQ({7}EoIs2w4E)g zcvR4hHn%YFJI)YUzzNrxFj&pxC1v_d9%cf2WW|5O_Km_1^j32Cvm})D|=s9^+go%r)lwwkIc;&fBcsB_=I7u%w?G9`Z%o&c*=9Xgv;~FllaSA#;P*NKOMfx7<4X21s zS+;yk#de!35xXKfpw;&zbp4i@ekMw4C;WG1OFg<|n)Zf^d8{{W@J2a%%S4#HySo5t z5D2V5i9TsS43Ni4C0C@m1%;J`Ft-Z}kXIbp>n&RCQI~UFt!as#b0qEuu;~bU-OxE4 zFu^R{ZY!y1WkBsbmmOj49*d>BoI(dO>l{;`oxbXmo2>1aJUUU?9%d5PPRPASrNkVA zGMBn-6?!BP?O>N$PFD59{to>ssVFMP7u3I?2AAm~t(41U#avd5BA>=?MO~zeJ+QTF zBiw-ba*qx2#@4QmQ>9Hkwlxm(`$&IxQE0pl&66cXR(Wj1!%5j<%WF>3YN)r=(2F+KoEx#7bIb`bFrG>n|5P0$zg9b zf^<|5f`rZ_3sRdFYjZb9ZK*c=W*@{U#sPw)Vd;ERB(MW+Exnj{7|FZeU2xO{D2%Np zRaSzes+?j7N2Ui%06WAb1AY2A0d2<@CkytUkwAbnt)W+q$pyqf@44hL-IWZeZG?=Z6WRd+pKJnr^c&K|O^}2u4qh}?#QT8Dg#-T2b5AYe vWlM|#1ZhiGMO+Epb{i4bQS1X0L+4C&6%aS!x;Evv=qQ=@uhK)>^wEC-bhhL4 delta 789 zcmZva-)qxQ6vuOuH0`>rmUOl4)>&DxG;_*U2P!ymV+g)DhbXj8v#jY#rAynJblo&c znGF1ah~Bq>FS36?iv9)SKOpQOKKkI}*o)8JZw6Mxg!{=km-9X6o^$hk`TczM!tikT zqIsj-IwuG}gufnz+t>KA_upps1LU@##XlZg4Uz>G56#K0pa*&@p-%x)>Zy@m5?Nvc z>QBxiHI}=}Wm=F<@(;ZO6oIoq3AhKWu)|oAEySKp-hvz9@MOD19SS*3z*ii7t>>JB(c+q{yb?U!xBYl>(B$T2DM#Bd02`b&itK+%)R#iYIN$1XAEKXbB0W{uJ+AB`RTtO+ z{M`VZhCKtsfCA8F-E@l2VlQosIMTxbH(*Mp%-`T^`HA8xz47t5cpe>U#FLG!n(dE9 z3wvs(RyTE%8XbMTZRuvM?08bO;_&gVX3nsenWfAKmkH6Z7$PA>CbBdkN63( Date: Mon, 17 Oct 2022 18:27:15 -0400 Subject: [PATCH 3/6] add test to bottom of file --- wildboottest/wildboottest.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/wildboottest/wildboottest.py b/wildboottest/wildboottest.py index fb5edec..e62bd99 100644 --- a/wildboottest/wildboottest.py +++ b/wildboottest/wildboottest.py @@ -412,6 +412,8 @@ def wildboottest(model, param, cluster, B, weights_type = 'rademacher',impose_nu R = np.zeros(len(xnames)) R[xnames.index(param)] = 1 + # Just test for beta=0 + # R = np.identity(len(xnames)) # is it possible to fetch the clustering variables from the pre-processed data # frame, e.g. with 'excluding' observations with missings etc @@ -432,4 +434,33 @@ def wildboottest(model, param, cluster, B, weights_type = 'rademacher',impose_nu return boot.pvalue +if __name__ == '__main__': + import statsmodels.api as sm + import numpy as np + np.random.seed(12312312) + N = 1000 + k = 10 + G= 10 + X = np.random.normal(0, 1, N * k).reshape((N,k)) + beta = np.random.normal(0,1,k) + beta[0] = 0.005 + u = np.random.normal(0,1,N) + Y = 1 + X @ beta + u + cluster = np.random.choice(list(range(0,G)), N) + + model = sm.OLS(Y, X) + + model.fit(cov_type='wildclusterbootstrap', + cov_kwds = {'cluster' : cluster, + 'B' : 9999, + 'weights_type' : 'rademacher', + 'impose_null' : True, + 'bootstrap_type' : '11', + 'seed' : None, + 'param' : 'X1'}).summary() + + + + # boottest(model, param = "X1", cluster = cluster, B = 9999) + From 36c6edf7b876491893972908ba3e9efe7c682691 Mon Sep 17 00:00:00 2001 From: Aleksandr Michuda Date: Tue, 18 Oct 2022 13:46:33 -0400 Subject: [PATCH 4/6] small change to wildboottest and gitignore --- .gitignore | 4 +++- wildboottest/wildboottest.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 571e4e6..7dd8363 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ .Rhistory .RData .Ruserdata -.Rproj \ No newline at end of file +.Rproj +__pycache__/ +*.egg-info/ \ No newline at end of file diff --git a/wildboottest/wildboottest.py b/wildboottest/wildboottest.py index e62bd99..1098192 100644 --- a/wildboottest/wildboottest.py +++ b/wildboottest/wildboottest.py @@ -49,8 +49,7 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None): self.N_G_bootcluster = len(bootclustid) self.G = len(clustid) - k = R.shape[0] - self.k = k + self.k = R.shape[0] self.B = B self.X = X self.R = R From 4bc72df021d12aa7301d624e91c74d73d186f5e2 Mon Sep 17 00:00:00 2001 From: Aleksandr Michuda Date: Tue, 18 Oct 2022 16:34:41 -0400 Subject: [PATCH 5/6] run boostrapping for all variables --- wildboottest/wildboottest.py | 70 +++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/wildboottest/wildboottest.py b/wildboottest/wildboottest.py index 1098192..e65ef21 100644 --- a/wildboottest/wildboottest.py +++ b/wildboottest/wildboottest.py @@ -60,8 +60,8 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None): y_list = [] tXgXg_list = [] tXgyg_list = [] - tXX = np.zeros((k, k)) - tXy = np.zeros(k) + tXX = np.zeros((self.k, self.k)) + tXy = np.zeros(self.k) #all_cluster = np.unique(bootcluster) @@ -354,7 +354,7 @@ def draw_weights(t : str, full_enumeration: bool, N_G_bootcluster: int, boot_ite return [v, boot_iter] -def wildboottest(model, param, cluster, B, weights_type = 'rademacher',impose_null = True, bootstrap_type = '11', seed = None): +def wildboottest(model, cluster, B, weights_type = 'rademacher',impose_null = True, bootstrap_type = '11', seed = None): ''' Run a wild cluster bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS' @@ -397,8 +397,6 @@ def wildboottest(model, param, cluster, B, weights_type = 'rademacher',impose_nu ''' - # set param to lowercase? model.data.xnames all seem to be lowercase? - param = str.lower(param) # does model.exog already exclude missing values? X = model.exog # interestingly, the dependent variable is called 'endogeneous' @@ -409,29 +407,41 @@ def wildboottest(model, param, cluster, B, weights_type = 'rademacher',impose_nu xnames = model.data.xnames ynames = model.data.ynames - R = np.zeros(len(xnames)) - R[xnames.index(param)] = 1 - # Just test for beta=0 - # R = np.identity(len(xnames)) + pvalues = [] + tstats = [] - # is it possible to fetch the clustering variables from the pre-processed data - # frame, e.g. with 'excluding' observations with missings etc - # cluster = ... + # hack to make a single param specified allow to be looped. + if isinstance(param, str): + param = [param] - # set bootcluster == cluster for one-way clustering - bootcluster = cluster - - boot = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = seed) - boot.get_scores(bootstrap_type = bootstrap_type, impose_null = impose_null) - boot.get_weights(weights_type = weights_type) - boot.get_numer() - boot.get_denom() - boot.get_tboot() - boot.get_vcov() - boot.get_tstat() - boot.get_pvalue(pval_type = "two-tailed") + for param in xnames: + R = np.zeros(len(xnames)) + R[xnames.index(param)] = 1 + # Just test for beta=0 + # R = np.identity(len(xnames)) + + # is it possible to fetch the clustering variables from the pre-processed data + # frame, e.g. with 'excluding' observations with missings etc + # cluster = ... + + # set bootcluster == cluster for one-way clustering + bootcluster = cluster + + boot = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, + R = R, B = B, seed = seed) + boot.get_scores(bootstrap_type = bootstrap_type, impose_null = impose_null) + boot.get_weights(weights_type = weights_type) + boot.get_numer() + boot.get_denom() + boot.get_tboot() + boot.get_vcov() + boot.get_tstat() + boot.get_pvalue(pval_type = "two-tailed") + + pvalues.append(boot.pvalue) + tstats.append(boot.t_stat) - return boot.pvalue + return np.array(pvalues), np.array(tstats).flatten() if __name__ == '__main__': import statsmodels.api as sm @@ -450,14 +460,18 @@ def wildboottest(model, param, cluster, B, weights_type = 'rademacher',impose_nu model = sm.OLS(Y, X) - model.fit(cov_type='wildclusterbootstrap', + print("--- WCB ---") + print(model.fit().summary()) + + print("--- NonRobust ---") + print(model.fit(cov_type='wildclusterbootstrap', cov_kwds = {'cluster' : cluster, 'B' : 9999, 'weights_type' : 'rademacher', 'impose_null' : True, 'bootstrap_type' : '11', - 'seed' : None, - 'param' : 'X1'}).summary() + 'seed' : None}).summary()) + From 5ad5410d4a1c349a9997041f957652ea92eef2bb Mon Sep 17 00:00:00 2001 From: Aleksandr Michuda Date: Tue, 18 Oct 2022 17:01:57 -0400 Subject: [PATCH 6/6] allow pandas dataframes as input --- wildboottest/wildboottest.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/wildboottest/wildboottest.py b/wildboottest/wildboottest.py index e65ef21..eab1759 100644 --- a/wildboottest/wildboottest.py +++ b/wildboottest/wildboottest.py @@ -354,7 +354,7 @@ def draw_weights(t : str, full_enumeration: bool, N_G_bootcluster: int, boot_ite return [v, boot_iter] -def wildboottest(model, cluster, B, weights_type = 'rademacher',impose_null = True, bootstrap_type = '11', seed = None): +def wildboottest(model, cluster, B, param = None, weights_type = 'rademacher',impose_null = True, bootstrap_type = '11', seed = None): ''' Run a wild cluster bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS' @@ -410,15 +410,11 @@ def wildboottest(model, cluster, B, weights_type = 'rademacher',impose_null = Tr pvalues = [] tstats = [] - # hack to make a single param specified allow to be looped. - if isinstance(param, str): - param = [param] - - for param in xnames: + def generate_stats(param): + R = np.zeros(len(xnames)) R[xnames.index(param)] = 1 # Just test for beta=0 - # R = np.identity(len(xnames)) # is it possible to fetch the clustering variables from the pre-processed data # frame, e.g. with 'excluding' observations with missings etc @@ -440,6 +436,16 @@ def wildboottest(model, cluster, B, weights_type = 'rademacher',impose_null = Tr pvalues.append(boot.pvalue) tstats.append(boot.t_stat) + + return pvalues, tstats + + if param is None: + for x in xnames: + pvalues, tstats = generate_stats(x) + elif isinstance(param, str): + pvalues, tstats = generate_stats(param) + else: + raise Exception("`param` not correctly specified") return np.array(pvalues), np.array(tstats).flatten() @@ -457,8 +463,12 @@ def wildboottest(model, cluster, B, weights_type = 'rademacher',impose_null = Tr u = np.random.normal(0,1,N) Y = 1 + X @ beta + u cluster = np.random.choice(list(range(0,G)), N) + + X_df = pd.DataFrame(data=X, columns = [f"col_{i}" for i in range(k)]) + + Y_df = pd.DataFrame(data=Y, columns = ['outcome']) - model = sm.OLS(Y, X) + model = sm.OLS(Y_df, X_df) print("--- WCB ---") print(model.fit().summary())