neurodata · PSSF23 · Nov 17, 2020 · Nov 4, 2020 · Nov 4, 2020 · Nov 4, 2020
diff --git a/docs/tutorials/functions/unc_forest_tutorials_functions.py b/docs/tutorials/functions/unc_forest_tutorials_functions.py
@@ -15,6 +15,9 @@
 from proglearn.forest import UncertaintyForest
 from proglearn.sims import generate_gaussian_parity
 
+from scipy.stats import entropy, norm
+from scipy.integrate import quad
+
 def generate_data(n, mean, var):
     '''
     Parameters
@@ -157,4 +160,220 @@ def plot_fig1(algos, num_plotted_trials, X_eval):
 
     fig.tight_layout()
     # plt.savefig("fig1.pdf")
+    plt.show()
+
+def generate_data_fig2(n, d, mu = 1):
+    '''Draw n labelled points from two identity-covariance Gaussians in d dimensions centred at +mu / -mu on axis 0.'''
+    num_pos = np.random.binomial(n, .5)  # class-1 count; class 0 gets the remainder
+    num_neg = n - num_pos
+    center = np.zeros(d)
+    center[0] = mu
+    cov = np.eye(d)
+    X_pos = np.random.multivariate_normal(center, cov, num_pos)
+    X_neg = np.random.multivariate_normal(-center, cov, num_neg)
+    return np.concatenate((X_pos, X_neg)), np.concatenate((np.repeat(1, num_pos), np.repeat(0, num_neg)))
+
+def cart_estimate(X, y, n_trees = 300, bootstrap = True):
+    '''
+    Estimate the conditional entropy H(Y|X) in nats from a CART random forest.
+
+    Every tree votes for each row of X with its leaf posterior weighted by the
+    n_node_samples entry of that leaf; the pooled votes are normalized into
+    class probabilities and the per-row entropies are averaged.
+    '''
+    model = RandomForestClassifier(bootstrap = bootstrap, n_estimators = n_trees)
+    model.fit(X, y)
+    class_counts = np.zeros((X.shape[0], model.n_classes_))
+    for tree_in_forest in model:
+        # n_node_samples of the leaf that each row of X lands in
+        leaf_sizes = tree_in_forest.tree_.n_node_samples[tree_in_forest.apply(X)]
+        # weight this tree's (n_rows, n_classes) posterior by leaf size
+        class_counts += tree_in_forest.predict_proba(X) * leaf_sizes[:, np.newaxis]
+    probs = class_counts / class_counts.sum(axis=1, keepdims=True)
+    # scipy's entropy treats 0*log(0) as 0 per term, so a zero-probability class
+    # no longer turns the whole row into nan (which nan_to_num then zeroed).
+    return np.mean(entropy(probs.T))
+
+
+def true_cond_entropy(mu, base = np.exp(1)):
+    '''Exact H(Y|X) for an equal-weight mixture of unit-variance normals at +mu / -mu, in log-base `base` units.'''
+    log_base = np.log(base)
+    def neg_p_log_p(x):
+        mixture = 0.5 * norm.pdf(x, mu, 1) + 0.5 * norm.pdf(x, -mu, 1)
+        return -mixture * np.log(mixture) / log_base
+    marginal_entropy, _ = quad(neg_p_log_p, -20, 20)  # H(X) of the mixture, integrated on [-20, 20]
+    class_cond_entropy = 0.5*(1.0 + np.log(2 * np.pi)) / log_base  # H(X|Y) of a unit-variance normal
+    label_entropy = np.log(2.0) / log_base  # H(Y) for two equally likely classes
+    return label_entropy - marginal_entropy + class_cond_entropy  # H(Y|X) = H(Y) - H(X) + H(X|Y)
+
+
+def format_func(value, tick_number):
+    '''
+    Tick formatter that labels only the ticks at 0, 0.5 and 1.
+
+    tick_number is accepted for the FuncFormatter call signature and is unused.
+    '''
+    for target, text in ((0, "0"), (0.5, "0.5"), (1, "1")):
+        if np.absolute(value - target) < 10 ** (-5):
+            return text
+    return ""
+
+def estimate_ce(X, y, label):
+    '''
+    Estimate the conditional entropy H(Y|X) in nats with the algorithm named by label.
+
+    "CART" delegates to cart_estimate on all of (X, y). "IRF" (isotonic-calibrated
+    random forest) and "UF" (UncertaintyForest) are fit on a random 70% of the data and
+    scored by the mean entropy of their predicted posteriors on the held-out 30%.
+    Raises ValueError for any other label.
+    '''
+    if label == "CART":
+        return cart_estimate(X, y)
+    elif label == "IRF":
+        # Estimator passed positionally: its keyword was renamed from
+        # base_estimator to estimator in scikit-learn 1.2.
+        model = CalibratedClassifierCV(RandomForestClassifier(n_estimators = 300), method='isotonic', cv = 5)
+    elif label == "UF":
+        model = UncertaintyForest(n_estimators = 300, tree_construction_proportion = 0.4, kappa = 3.0)
+    else:
+        raise ValueError("Unrecognized Label!")
+    X_train, X_eval, y_train, _ = train_test_split(X, y, test_size = 0.3)
+    model.fit(X_train, y_train)
+    p = model.predict_proba(X_eval)
+    return np.mean(entropy(p.T, base = np.exp(1)))
+
+def get_cond_entropy_vs_n(mean, d, num_trials, sample_sizes, algos):
+    '''
+    Estimate conditional entropy at every sample size, for every algorithm and trial.
+
+    Parameters
+    ----------
+    mean : effect size passed to generate_data_fig2 as mu.
+    d : dimensionality of the simulated data.
+    num_trials : number of independent datasets drawn per sample size.
+    sample_sizes : sequence of dataset sizes n.
+    algos : sequence of dicts; each 'label' selects an estimator in estimate_ce.
+    Returns
+    -------
+    Array of shape (len(algos), len(sample_sizes), num_trials). The sample sizes and
+    each algorithm's slice are also pickled under output/.
+    '''
+    def worker(n):
+        X, y = generate_data_fig2(n, d, mu = mean)
+        return tuple(estimate_ce(X, y, algo['label']) for algo in algos)
+
+    output = np.zeros((len(algos), len(sample_sizes), num_trials))
+    for i, n in enumerate(sample_sizes):
+        results = np.array(Parallel(n_jobs=-2)(delayed(worker)(n) for _ in range(num_trials)))
+        output[:, i, :] = results.T
+    with open('output/sample_sizes_d_%d.pkl' % d, 'wb') as f:
+        pickle.dump(sample_sizes, f)
+    for j, algo in enumerate(algos):
+        with open('output/%s_by_n_d_%d.pkl' % (algo['label'], d), 'wb') as f:
+            pickle.dump(output[j], f)
+    return output
+
+def get_cond_entropy_vs_mu(n, d, num_trials, mus, algos):
+    '''
+    Estimate conditional entropy at every effect size, for every algorithm and trial.
+
+    Parameters
+    ----------
+    n : number of samples in each simulated dataset.
+    d : dimensionality of the simulated data.
+    num_trials : number of independent datasets drawn per effect size.
+    mus : sequence of effect sizes passed to generate_data_fig2 as mu.
+    algos : sequence of dicts; each 'label' selects an estimator in estimate_ce.
+    Returns
+    -------
+    Array of shape (len(algos), len(mus), num_trials). The effect sizes (in
+    output/mus.pkl, a name that does not depend on d) and each algorithm's slice are pickled.
+    '''
+    def worker(mu):
+        # One trial: a fresh dataset at this effect size, scored by every algorithm.
+        X, y = generate_data_fig2(n, d, mu = mu)
+        return tuple(estimate_ce(X, y, algo['label']) for algo in algos)
+
+    output = np.zeros((len(algos), len(mus), num_trials))
+    for i, mu in enumerate(mus):
+        results = np.array(Parallel(n_jobs=-2)(delayed(worker)(mu) for _ in range(num_trials)))
+        output[:, i, :] = results.T
+    with open('output/mus.pkl', 'wb') as f:
+        pickle.dump(mus, f)
+    for j, algo in enumerate(algos):
+        with open('output/%s_by_mu_d_%d.pkl' % (algo['label'], d), 'wb') as f:
+            pickle.dump(output[j], f)
+    return output
+
+def plot_cond_entropy_by_n(ax, num_plotted_trials, d, mu, algos, panel):
+    '''
+    Plot estimated conditional entropy against sample size on ax.
+
+    Reads the pickles written by get_cond_entropy_vs_n for dimension d. For each
+    algorithm the mean over trials is a thick line and the first num_plotted_trials
+    trials are faint lines; the true value for effect size mu is a black horizontal line.
+    '''
+    def load(path):
+        # Context manager closes the handle that a bare pickle.load(open(...)) would leak.
+        with open(path, 'rb') as f:
+            return pickle.load(f)
+
+    sample_sizes = np.array(load('output/sample_sizes_d_%d.pkl' % d))
+    for algo in algos:
+        result = load('output/%s_by_n_d_%d.pkl' % (algo['label'], d))
+        ax.plot(sample_sizes, np.mean(result, axis = 1).flatten(),
+                label = algo['label'], linewidth = 4, color = algo['color'])
+        for t in range(num_plotted_trials):
+            ax.plot(sample_sizes, result[:, t].flatten(), linewidth = 2, color = algo['color'], alpha = 0.15)
+
+    ax.axhline(y = true_cond_entropy(mu), linestyle = '-', color = "black", label = "Truth")
+    ax.yaxis.set_major_formatter(plt.FuncFormatter(format_func))
+    ax.set_xlabel("Sample Size")
+    ax.set_ylabel("Estimated Conditional Entropy")
+    ax.set_title("%s) Effect Size = %.1f" % (panel, mu))
+    ax.set_ylim(ymin = -0.05, ymax = 1.05)
+
+def plot_cond_entropy_by_mu(ax, d, n, algos, panel):
+    '''
+    Plot estimated conditional entropy against effect size on ax.
+
+    Reads the pickles written by get_cond_entropy_vs_mu for dimension d and draws each
+    algorithm's mean over trials alongside the true curve; n only appears in the title.
+    '''
+    def load(path):
+        with open(path, 'rb') as f:  # closes the handle a bare pickle.load(open(...)) would leak
+            return pickle.load(f)
+    mus = load('output/mus.pkl')
+    for algo in algos:
+        result = load('output/%s_by_mu_d_%d.pkl' % (algo['label'], d))
+        ax.plot(mus, np.mean(result, axis = 1).flatten(), label = algo['label'], linewidth = 4, color = algo['color'])
+    ax.plot(mus, [true_cond_entropy(mu) for mu in mus], label = 'Truth', linewidth = 4, color = 'black')
+    ax.yaxis.set_major_formatter(plt.FuncFormatter(format_func))
+    ax.set_ylim(ymin = -.05)
+    ax.set_title("%s) n = %d" % (panel, n))
+    ax.set_xlabel("Effect Size")
+    ax.set_ylabel("Estimated Conditional Entropy")
+
+
+def plot_fig2(num_plotted_trials, d1, d2, n1, n2, effect_size, algos):
+    '''
+    Draw and show the 2x2 conditional-entropy figure from the pickles under output/.
+    Rows are dimensions (d1 above d2); the left column sweeps sample size at effect_size,
+    the right column sweeps effect size, titled with the row's sample size (n1 or n2).
+    '''
+    sns.set(font_scale = 3)
+    sns.set_style("ticks")
+    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams['figure.figsize'] = [30, 20]
+    fig, axes = plt.subplots(2, 2)
+    rows = ((d1, n1, "A", "B"), (d2, n2, "C", "D"))
+    for (by_n_ax, by_mu_ax), (dim, n, left_panel, right_panel) in zip(axes, rows):
+        plot_cond_entropy_by_n(by_n_ax, num_plotted_trials, dim, effect_size, algos, left_panel)
+        plot_cond_entropy_by_mu(by_mu_ax, dim, n, algos, right_panel)
+    axes[0,0].legend(loc = "upper left")
+    for height, dim in ((0.27, d2), (0.77, d1)):
+        fig.text(-0.05, height, 'd = %d' % dim, ha='left', va='center', fontsize = 40)
+    plt.subplots_adjust(left=-1)
+    plt.tight_layout()
+    # plt.savefig("fig2.pdf", bbox_inches = "tight")
     plt.show()
diff --git a/docs/tutorials/output/CART_by_mu_d_1.pkl b/docs/tutorials/output/CART_by_mu_d_1.pkl
diff --git a/docs/tutorials/output/CART_by_mu_d_3.pkl b/docs/tutorials/output/CART_by_mu_d_3.pkl
diff --git a/docs/tutorials/output/CART_by_n_d_1.pkl b/docs/tutorials/output/CART_by_n_d_1.pkl
diff --git a/docs/tutorials/output/CART_by_n_d_3.pkl b/docs/tutorials/output/CART_by_n_d_3.pkl
diff --git a/docs/tutorials/output/IRF_by_mu_d_1.pkl b/docs/tutorials/output/IRF_by_mu_d_1.pkl
diff --git a/docs/tutorials/output/IRF_by_mu_d_3.pkl b/docs/tutorials/output/IRF_by_mu_d_3.pkl
diff --git a/docs/tutorials/output/IRF_by_n_d_1.pkl b/docs/tutorials/output/IRF_by_n_d_1.pkl
diff --git a/docs/tutorials/output/IRF_by_n_d_3.pkl b/docs/tutorials/output/IRF_by_n_d_3.pkl
diff --git a/docs/tutorials/output/UF_by_mu_d_1.pkl b/docs/tutorials/output/UF_by_mu_d_1.pkl
diff --git a/docs/tutorials/output/UF_by_mu_d_3.pkl b/docs/tutorials/output/UF_by_mu_d_3.pkl
diff --git a/docs/tutorials/output/UF_by_n_d_1.pkl b/docs/tutorials/output/UF_by_n_d_1.pkl
diff --git a/docs/tutorials/output/UF_by_n_d_3.pkl b/docs/tutorials/output/UF_by_n_d_3.pkl
diff --git a/docs/tutorials/output/mus.pkl b/docs/tutorials/output/mus.pkl
diff --git a/docs/tutorials/output/sample_sizes_d_1.pkl b/docs/tutorials/output/sample_sizes_d_1.pkl
diff --git a/docs/tutorials/output/sample_sizes_d_3.pkl b/docs/tutorials/output/sample_sizes_d_3.pkl
diff --git a/docs/tutorials/uncertaintyforest_fig1.ipynb b/docs/tutorials/uncertaintyforest_fig1.ipynb
@@ -68,7 +68,7 @@
     "# The following are two sets of parameters.\n",
     "# The first are those that were actually used to produce figure 1.\n",
     "# These take a long time to actually run since there are 6000 data points.\n",
-    "# Below those, you'll find some testing parameters so that you can see the results quicker.\n",
+    "# Below those, you'll find some testing parameters so that you can see the results more quickly.\n",
     "\n",
     "# Here are the \"Real Parameters\"\n",
     "#n = 6000\n",
@@ -197,4 +197,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}