py-econometrics · amichuda · Oct 19, 2022 · Oct 14, 2022 · Oct 15, 2022 · Oct 17, 2022
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,6 @@
 .Rhistory
 .RData
 .Ruserdata
-.Rproj
+.Rproj
+__pycache__/
+*.egg-info/
diff --git a/readme.md b/readme.md
@@ -12,43 +12,74 @@ If you'd like to cooperate, either send us an
 
 ## Example 
 
-Note: everything is still very much work in progress, and there are multiple errors in the code that I am aware of. Still, I believe that the implementation of the WCR11 is more or less correct.
+Note: everything is still very much work in progress. Still, I believe that the implementation of the WCR11 and WCU 11 is more or less correct.
 
 ```
-import wildboottest.wildboottest as wb
+from wildboottest.wildboottest import wildboottest, Wildboottest
+import statsmodels.api as sm
 import numpy as np
 import timeit 
 import time
 
-N = 100000
-k = 100
-G= 50
+N = 1000
+k = 10
+G= 12
 X = np.random.normal(0, 1, N * k).reshape((N,k))
 beta = np.random.normal(0,1,k)
 beta[0] = 0.005
 u = np.random.normal(0,1,N)
 Y = 1 + X @ beta + u
 cluster = np.random.choice(list(range(0,G)), N)
+B = 99999
+
+
+model = sm.OLS(Y, X)
+model.exog
+results = model.fit(cov_type = 'cluster', cov_kwds = {
+   'groups': cluster
+})
+results.summary()
+# >>> results.summary()
+# <class 'statsmodels.iolib.summary.Summary'>
+# """
+#                                  OLS Regression Results                                
+# =======================================================================================
+# Dep. Variable:                      y   R-squared (uncentered):                   0.799
+# Model:                            OLS   Adj. R-squared (uncentered):              0.797
+# Method:                 Least Squares   F-statistic:                              790.3
+# Date:                Sat, 15 Oct 2022   Prob (F-statistic):                    3.16e-14
+# Time:                        12:03:43   Log-Likelihood:                         -1784.6
+# No. Observations:                1000   AIC:                                      3589.
+# Df Residuals:                     990   BIC:                                      3638.
+# Df Model:                          10                                                  
+# Covariance Type:              cluster                                                  
+# ==============================================================================
+#                  coef    std err          z      P>|z|      [0.025      0.975]
+# ------------------------------------------------------------------------------
+# x1             0.0128      0.064      0.200      0.841      -0.113       0.138
+wildboottest(model, "X1", cluster, B)
+# 0.8408408408408409
+
+
+# execute, method by method
+
+# some preparations
 bootcluster = cluster
 R = np.zeros(k)
 R[0] = 1
-B = 99999
 
 start_time = timeit.default_timer()
 wcr = wb.Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341)
 wcr.get_scores(bootstrap_type = "11", impose_null = True)
 wcr.get_weights(weights_type = "rademacher")
 wcr.get_numer()
 wcr.get_denom()
-wcr.numer
-wcr.denom
 wcr.get_tboot()
-wcr.t_boot
 wcr.get_vcov()
 wcr.get_tstat()
 wcr.get_pvalue(pval_type = "two-tailed")
 print("estimation time:", timeit.default_timer() - start_time)
 # >>> 0.9225496 seconds
 print("p value:", wcr.pvalue)
-# >>> p value: 0.15258152581525816
+# >>> p value: 0.8408408408408409
 ```
diff --git a/tests/bootalgo-tests.py b/tests/bootalgo-tests.py
@@ -8,7 +8,7 @@ def WCR11_not_WCU11():
   G= 20
   X = np.random.normal(0, 1, N * k).reshape((N,k))
   beta = np.random.normal(0,1,k)
-  beta[0] = 1
+  beta[0] = 0.1
   u = np.random.normal(0,1,N)
   y = 1 + X @ beta + u
   cluster = np.random.choice(list(range(0,G)), N)
@@ -17,37 +17,31 @@ def WCR11_not_WCU11():
   R[0] = 1
   B = 999
 
-  wcr = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
-  wcr.get_scores(bootstrap_type = "11", impose_null = True)
-  wcr.get_weights(weights_type = "rademacher")
-  wcr.get_numer()
-  wcr.get_denom()
-  wcr.numer
-  wcr.denom
-  wcr.get_tboot()
-  wcr.t_boot
-  wcr.get_vcov()
-  wcr.get_tstat()
-  wcr.get_pvalue(pval_type = "two-tailed")
+  boot = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
+  boot.get_scores(bootstrap_type = "11", impose_null = True)
+  boot.get_weights(weights_type = "rademacher")
+  boot.get_numer()
+  boot.get_denom()
+  boot.get_tboot()
+  boot.get_vcov()
+  boot.get_tstat()
+  boot.get_pvalue(pval_type = "two-tailed")
 
-  wcu = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
-  wcu.get_scores(bootstrap_type = "11", impose_null = True)
-  wcr.get_weights(weights_type = "rademacher")
+  wcu = wb.Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = 99999, seed = 12341)
+  wcu.get_scores(bootstrap_type = "11", impose_null = False)
+  wcu.get_weights(weights_type = "rademacher")
   wcu.get_numer()
   wcu.get_denom()
-  wcu.numer
-  wcu.denom
   wcu.get_tboot()
-  wcu.t_boot
   wcu.get_vcov()
   wcu.get_tstat()
-  wcu-(pval_type = "two-tailed")
+  wcu.get_pvalue(pval_type = "two-tailed")
 
   # score matrices of WCR11 and WCU11 should be different - currently not the case
-  assert not np.array_equal(wcr.scores_mat, wcu.scores_mat)
-  assert not np.array_equal(wcr.t_boot, wcu.t_boot)
-  assert np.array_equal(wcr.t_stat, wcu.t_stat)
-  assert not np.array_equal(wcr.pvalue, wcu.pvalue) # unless both pvals are zero or 1...
+  assert not np.array_equal(boot.scores_mat, wcu.scores_mat)
+  assert not np.array_equal(boot.t_boot, wcu.t_boot)
+  assert np.array_equal(boot.t_stat, wcu.t_stat)
+  assert not np.array_equal(boot.pvalue, wcu.pvalue) # unless both pvals are zero or 1...
 
 
 def test_r_vs_py():
@@ -80,14 +74,14 @@ def full_enum_works():
   B = 99999
 
   wcr = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341)
-  wcr.get_scores(bootstrap_type = "11", impose_null = False)
-  wcr.get_weights(weights_type = "rademacher")
-  wcr.get_numer()
-  wcr.get_denom()
-  wcr.get_tboot()
+  boot.get_scores(bootstrap_type = "11", impose_null = False)
+  boot.get_weights(weights_type = "rademacher")
+  boot.get_numer()
+  boot.get_denom()
+  boot.get_tboot()
 
-  assert len(wcr.t_boot) == 2**G
-  assert wcr.full_enumeration == True
+  assert len(boot.t_boot) == 2**G
+  assert boot.full_enumeration == True
 
 
 

diff --git a/wildboottest/__pycache__/wildboottest.cpython-36.pyc b/wildboottest/__pycache__/wildboottest.cpython-36.pyc
diff --git a/wildboottest/wildboottest.py b/wildboottest/wildboottest.py
@@ -49,8 +49,7 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None):
       self.N_G_bootcluster = len(bootclustid)
       self.G  = len(clustid)
 
-      k = R.shape[0]
-      self.k = k 
+      self.k = R.shape[0]
       self.B = B
       self.X = X
       self.R = R
@@ -61,8 +60,8 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None):
       y_list = []
       tXgXg_list = []
       tXgyg_list = []
-      tXX = np.zeros((k, k))
-      tXy = np.zeros(k)
+      tXX = np.zeros((self.k, self.k))
+      tXy = np.zeros(self.k)
 
       #all_cluster = np.unique(bootcluster)
 
@@ -355,4 +354,136 @@ def draw_weights(t : str, full_enumeration: bool, N_G_bootcluster: int, boot_ite
     return [v, boot_iter]
 
 
+def wildboottest(model, cluster, B, param = None, weights_type = 'rademacher',impose_null = True, bootstrap_type = '11', seed = None):
+
+  '''
+  Run a wild cluster bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS'
+
+  Args: 
+    model(statsmodels.regression.linear_model.OLS'): A statsmodels regression object
+    param(str): A string of length one, containing the test parameter of interest
+    cluster(np.array): A numpy array of dimension one, containing the clustering variable.
+    B(int): The number of bootstrap iterations to run
+    weights_type(str): The type of bootstrap weights. Either 'rademacher', 'mammen', 'webb' or 'normal'. 
+                       'rademacher' by default.
+    impose_null(logical): Should the null hypothesis be imposed on the bootstrap dgp, or not?
+                          True by default. 
+    bootstrap_type(str). A string of length one. Allows to choose the bootstrap type 
+                         to be run. Either '11', '31', '13' or '33'. '11' by default.
+    seed(int). Option to provide a random seed. 
+
+  Returns: 
+    A wild cluster bootstrapped p-value. 
+
+  Example: 
+
+    import statsmodels.api as sm
+    import numpy as np
+
+    np.random.seed(12312312)
+    N = 1000
+    k = 10
+    G= 10
+    X = np.random.normal(0, 1, N * k).reshape((N,k))
+    beta = np.random.normal(0,1,k)
+    beta[0] = 0.005
+    u = np.random.normal(0,1,N)
+    Y = 1 + X @ beta + u
+    cluster = np.random.choice(list(range(0,G)), N)
+
+    model = sm.OLS(Y, X)
+
+    boottest(model, param = "X1", cluster = cluster, B = 9999)
+
+  '''
+
+  # does model.exog already exclude missing values?
+  X = model.exog
+  # interestingly, the dependent variable is called 'endogeneous'
+  Y = model.endog
+  # weights not yet used, only as a placeholder
+  weights = model.weights
+
+  xnames = model.data.xnames
+  ynames = model.data.ynames
+
+  pvalues = []
+  tstats = []
+
+  def generate_stats(param):
+
+    R = np.zeros(len(xnames))
+    R[xnames.index(param)] = 1
+    # Just test for beta=0
+
+    # is it possible to fetch the clustering variables from the pre-processed data 
+    # frame, e.g. with 'excluding' observations with missings etc
+    # cluster = ...
+
+    # set bootcluster == cluster for one-way clustering
+    bootcluster = cluster
+
+    boot = Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, 
+                        R = R, B = B, seed = seed)
+    boot.get_scores(bootstrap_type = bootstrap_type, impose_null = impose_null)
+    boot.get_weights(weights_type = weights_type)
+    boot.get_numer()
+    boot.get_denom()
+    boot.get_tboot()
+    boot.get_vcov()
+    boot.get_tstat()
+    boot.get_pvalue(pval_type = "two-tailed")
+
+    pvalues.append(boot.pvalue)
+    tstats.append(boot.t_stat)
+
+    return pvalues, tstats
+
+  if param is None:
+    for x in xnames:
+      pvalues, tstats = generate_stats(x)
+  elif isinstance(param, str):
+    pvalues, tstats = generate_stats(param)
+  else:
+    raise Exception("`param` not correctly specified")
+
+  return np.array(pvalues), np.array(tstats).flatten()
+
+if __name__ == '__main__':
+    import statsmodels.api as sm
+    import numpy as np
 
+    np.random.seed(12312312)
+    N = 1000
+    k = 10
+    G= 10
+    X = np.random.normal(0, 1, N * k).reshape((N,k))
+    beta = np.random.normal(0,1,k)
+    beta[0] = 0.005
+    u = np.random.normal(0,1,N)
+    Y = 1 + X @ beta + u
+    cluster = np.random.choice(list(range(0,G)), N)
+
+    X_df = pd.DataFrame(data=X, columns = [f"col_{i}" for i in range(k)])
+
+    Y_df = pd.DataFrame(data=Y, columns = ['outcome'])
+
+    model = sm.OLS(Y_df, X_df)
+
+    print("--- WCB ---")
+    print(model.fit().summary())
+
+    print("--- NonRobust ---")
+    print(model.fit(cov_type='wildclusterbootstrap',
+              cov_kwds = {'cluster' : cluster,
+                          'B' : 9999,
+                          'weights_type' : 'rademacher',
+                          'impose_null' : True, 
+                          'bootstrap_type' : '11', 
+                          'seed' : None}).summary())
+
+
+
+
+    # boottest(model, param = "X1", cluster = cluster, B = 9999)
+