Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize UncertaintyForest() notebooks and add Figure 2 tutorial #365

Merged
merged 24 commits into from
Nov 17, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
705c952
fig 2 tutorial
EYezerets Nov 4, 2020
32a76f2
call generate_data_fig2
EYezerets Nov 4, 2020
5be0dd2
move estimate_ce to functions
EYezerets Nov 4, 2020
0e10760
UncertaintyForest n_estimators = 300
EYezerets Nov 4, 2020
7b2e95a
tutorial format for netlify?
EYezerets Nov 5, 2020
e2d4467
Changed metadata after locally overwritten
EYezerets Nov 5, 2020
6b91aff
cart_estimate lowercase
EYezerets Nov 5, 2020
d4dd664
circumvent Parallel for debugging
EYezerets Nov 5, 2020
b047509
Merge pull request #8 from neurodata/staging
EYezerets Nov 11, 2020
ab7320d
Merge pull request #9 from EYezerets/staging
EYezerets Nov 11, 2020
8e2ad15
figure 2 - moved functions back into notebook for testing, encountere…
EYezerets Nov 11, 2020
2036c1b
self.lf_ linked in forest.py, key error in fig2tutorial notebook
EYezerets Nov 11, 2020
e2bd528
runs figure 2 tutorial, functions defined in notebook, not parallel
EYezerets Nov 12, 2020
a2a5a11
runs figure 2 tutorial, functions in unc_forest_tutorials_functions.p…
EYezerets Nov 12, 2020
10870f2
test parallel
EYezerets Nov 13, 2020
2a6090b
fig2tutorial run on AWS, parallel
EYezerets Nov 13, 2020
7e06bde
no pkl output
EYezerets Nov 16, 2020
e3ae0fa
with Parallel, on AWS
EYezerets Nov 16, 2020
82e4a01
tutorials.rst updated, text updated in uf tutorials, need to rerun fi…
EYezerets Nov 16, 2020
b48af2b
correct notebook formatting error in defining algos in fig2 tutorial
EYezerets Nov 16, 2020
2ef83f9
Merge branch 'staging' into fig2tutorial
PSSF23 Nov 16, 2020
81096b8
fig 2 tutorial run
EYezerets Nov 16, 2020
ed0dfd3
Merge branch 'fig2tutorial' of https://github.com/EYezerets/ProgLearn…
EYezerets Nov 16, 2020
17ff5ac
remove extra cell in uncertaintyforest_running_example tutorial
EYezerets Nov 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 219 additions & 0 deletions docs/tutorials/functions/unc_forest_tutorials_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
from proglearn.forest import UncertaintyForest
from proglearn.sims import generate_gaussian_parity

from scipy.stats import entropy, norm
from scipy.integrate import quad

def generate_data(n, mean, var):
'''
Parameters
Expand Down Expand Up @@ -157,4 +160,220 @@ def plot_fig1(algos, num_plotted_trials, X_eval):

fig.tight_layout()
# plt.savefig("fig1.pdf")
plt.show()

def generate_data_fig2(n, d, mu = 1):
    '''
    Draw a labelled sample from two unit-covariance Gaussians.

    Parameters
    ----------
    n : int
        Total number of points to draw.
    d : int
        Dimension of each point.
    mu : float, default=1
        Class 1 has mean ``+mu`` and class 0 has mean ``-mu`` in the first
        coordinate; every other coordinate has mean zero for both classes.

    Returns
    -------
    X : ndarray of shape (n, d)
        Sampled points, class 1 rows first, then class 0 rows.
    y : ndarray of shape (n,)
        Labels (1 or 0) aligned with the rows of X.
    '''
    # The number of class-1 points is itself random: Binomial(n, 1/2).
    num_pos = np.random.binomial(n, .5)
    num_neg = n - num_pos

    center = np.zeros(d)
    center[0] = mu
    identity = np.eye(d)

    # Class 1 is sampled before class 0 so that seeded runs stay reproducible.
    pos_points = np.random.multivariate_normal(center, identity, num_pos)
    neg_points = np.random.multivariate_normal(-center, identity, num_neg)

    X = np.concatenate((pos_points, neg_points))
    y = np.concatenate((np.repeat(1, num_pos), np.repeat(0, num_neg)))
    return X, y

def cart_estimate(X, y, n_trees = 300, bootstrap = True):
    '''
    Estimate the conditional entropy of y given X with a plain random forest.

    A RandomForestClassifier is fit on (X, y). For every training point the
    per-tree class probabilities are weighted by the sample count of the leaf
    the point falls into and summed over trees; the normalised totals are
    treated as a posterior whose entropy (in nats) is averaged over points.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training inputs (also used as the evaluation points).
    y : array-like of shape (n_samples,)
        Class labels.
    n_trees : int, default=300
        Number of trees in the forest.
    bootstrap : bool, default=True
        Passed through to RandomForestClassifier.

    Returns
    -------
    float
        Mean posterior entropy over the rows of X.
    '''
    model = RandomForestClassifier(bootstrap = bootstrap, n_estimators = n_trees)
    model.fit(X, y)

    class_counts = np.zeros((X.shape[0], model.n_classes_))
    for tree_in_forest in model:
        # Sample count recorded at every node of this tree.
        node_counts = tree_in_forest.tree_.n_node_samples
        # Leaf id of every row of X -> count of the leaf it lands in
        # (single fancy-indexing step instead of a per-row Python loop).
        partition_counts = node_counts[tree_in_forest.apply(X)]
        # Per-row class probabilities, shape (n_samples, n_classes).
        class_probs = tree_in_forest.predict_proba(X)
        # Scale each row's probabilities by its leaf count and accumulate.
        class_counts += class_probs * partition_counts[:, np.newaxis]

    probs = class_counts / class_counts.sum(axis=1, keepdims=True)

    # Use the convention 0 * log(0) = 0 term by term. Taking log(0) and then
    # zeroing NaN row sums would discard the entropy of the non-zero classes
    # whenever there are more than two classes, and emits RuntimeWarnings.
    log_probs = np.zeros_like(probs)
    np.log(probs, out=log_probs, where=probs > 0)
    entropies = -np.sum(probs * log_probs, axis = 1)

    # Kept as a last-resort guard for a degenerate all-zero count row.
    entropies = np.nan_to_num(entropies)
    return np.mean(entropies)


def true_cond_entropy(mu, base = np.exp(1)):
    '''
    Compute H(Y|X) for a balanced two-component 1-D Gaussian mixture.

    X is N(+mu, 1) or N(-mu, 1) with probability 1/2 each, and Y is the
    component indicator. The result uses the identity
    H(Y|X) = H(Y) - H(X) + H(X|Y), with H(X) obtained by numerically
    integrating the mixture density over [-20, 20].

    Parameters
    ----------
    mu : float
        Half the distance between the two component means.
    base : float, default=e
        Logarithm base of the returned entropy.

    Returns
    -------
    float
        The conditional entropy in the requested base.
    '''
    log_base = np.log(base)

    def neg_p_log_p(x):
        # Integrand of the differential entropy of the mixture density.
        density = 0.5 * norm.pdf(x, mu, 1) + 0.5 * norm.pdf(x, -mu, 1)
        return -density * np.log(density) / log_base

    # quad returns (value, abs_error_estimate); only the value is needed.
    marginal_entropy = quad(neg_p_log_p, -20, 20)[0]
    # Differential entropy of a unit-variance Gaussian, i.e. H(X|Y).
    within_class_entropy = 0.5 * (1.0 + np.log(2 * np.pi)) / log_base
    # Entropy of a fair binary label.
    label_entropy = np.log(2.0) / log_base

    return label_entropy - marginal_entropy + within_class_entropy


def format_func(value, tick_number):
    '''
    Tick formatter that labels only the ticks at 0, 0.5 and 1.

    Parameters
    ----------
    value : float
        Tick position.
    tick_number : int
        Tick index; accepted for the formatter signature but not used.

    Returns
    -------
    str
        "0", "0.5" or "1" when value is within 1e-5 of that number,
        otherwise the empty string.
    '''
    tolerance = 10 ** (-5)
    labelled_ticks = ((0, "0"), (0.5, "0.5"), (1, "1"))
    for target, text in labelled_ticks:
        if np.absolute(value - target) < tolerance:
            return text
    return ""

def estimate_ce(X, y, label):
    '''
    Estimate conditional entropy with the algorithm named by label.

    Parameters
    ----------
    X : array-like
        Input samples.
    y : array-like
        Class labels.
    label : {"CART", "IRF", "UF"}
        "CART" delegates to cart_estimate on the full data. "IRF" (an
        isotonic-calibrated random forest) and "UF" (UncertaintyForest) are
        fit on 70% of the data and scored on the held-out 30%.

    Returns
    -------
    float
        Mean entropy (natural log) of the estimated posteriors.

    Raises
    ------
    ValueError
        If label is not one of the recognised names.
    '''
    if label == "CART":
        return cart_estimate(X, y)

    # Build the model first; an unknown label fails before any data is touched.
    if label == "IRF":
        model = CalibratedClassifierCV(base_estimator=RandomForestClassifier(n_estimators = 300),
                                       method='isotonic',
                                       cv = 5)
    elif label == "UF":
        model = UncertaintyForest(n_estimators = 300, tree_construction_proportion = 0.4, kappa = 3.0)
    else:
        raise ValueError("Unrecognized Label!")

    # Hold out a fraction of the data for evaluating the posteriors.
    frac_eval = 0.3
    X_train, X_eval, y_train, _ = train_test_split(X, y, test_size=frac_eval)
    model.fit(X_train, y_train)
    posteriors = model.predict_proba(X_eval)
    # entropy() works column-wise, so transpose to get one value per sample.
    return np.mean(entropy(posteriors.T, base = np.exp(1)))

def get_cond_entropy_vs_n(mean, d, num_trials, sample_sizes, algos):
    '''
    Run every algorithm over a range of sample sizes and store the estimates.

    For each sample size, num_trials independent datasets are generated with
    generate_data_fig2 and each algorithm's conditional-entropy estimate is
    computed via estimate_ce. Trials run in parallel through joblib.

    Parameters
    ----------
    mean : float
        Effect size passed to generate_data_fig2 as mu.
    d : int
        Data dimension.
    num_trials : int
        Number of repetitions per sample size.
    sample_sizes : sequence of int
        Sample sizes to evaluate.
    algos : sequence of dict
        One dict per algorithm; the 'label' key selects the estimator and is
        used in the output file name.

    Returns
    -------
    ndarray of shape (len(algos), len(sample_sizes), num_trials)
        Estimated conditional entropies.

    Side effects
    ------------
    Writes 'output/sample_sizes_d_<d>.pkl' and, per algorithm,
    'output/<label>_by_n_d_<d>.pkl' relative to the working directory.
    '''
    import os

    def worker(n):
        # The sample size is passed in explicitly rather than read from the
        # enclosing loop variable, so the task does not depend on loop state.
        X, y = generate_data_fig2(n, d, mu = mean)
        return tuple(estimate_ce(X, y, algo['label']) for algo in algos)

    output = np.zeros((len(algos), len(sample_sizes), num_trials))
    for i, elem in enumerate(sample_sizes):
        # results has shape (num_trials, len(algos)).
        results = np.array(Parallel(n_jobs=-2)(delayed(worker)(elem) for _ in range(num_trials)))
        output[:, i, :] = results.T

    # Make sure the target directory exists so the finished results are not
    # lost to a FileNotFoundError after all the computation is done.
    os.makedirs('output', exist_ok=True)
    with open('output/sample_sizes_d_%d.pkl' % d, 'wb') as f:
        pickle.dump(sample_sizes, f)
    for j, algo in enumerate(algos):
        with open('output/%s_by_n_d_%d.pkl' % (algo['label'], d), 'wb') as f:
            pickle.dump(output[j], f)

    return output

def get_cond_entropy_vs_mu(n, d, num_trials, mus, algos):
    '''
    Run every algorithm over a range of effect sizes and store the estimates.

    For each effect size, num_trials independent datasets of n points are
    generated with generate_data_fig2 and each algorithm's
    conditional-entropy estimate is computed via estimate_ce. Trials run in
    parallel through joblib.

    Parameters
    ----------
    n : int
        Sample size of every generated dataset.
    d : int
        Data dimension.
    num_trials : int
        Number of repetitions per effect size.
    mus : sequence of float
        Effect sizes to evaluate.
    algos : sequence of dict
        One dict per algorithm; the 'label' key selects the estimator and is
        used in the output file name.

    Returns
    -------
    ndarray of shape (len(algos), len(mus), num_trials)
        Estimated conditional entropies.

    Side effects
    ------------
    Writes 'output/mus.pkl' and, per algorithm,
    'output/<label>_by_mu_d_<d>.pkl' relative to the working directory.
    '''
    import os

    def worker(effect_size):
        # The effect size is passed in explicitly rather than read from the
        # enclosing loop variable, so the task does not depend on loop state.
        X, y = generate_data_fig2(n, d, mu = effect_size)
        return tuple(estimate_ce(X, y, algo['label']) for algo in algos)

    output = np.zeros((len(algos), len(mus), num_trials))
    for i, elem in enumerate(mus):
        # results has shape (num_trials, len(algos)).
        results = np.array(Parallel(n_jobs=-2)(delayed(worker)(elem) for _ in range(num_trials)))
        output[:, i, :] = results.T

    # Make sure the target directory exists so the finished results are not
    # lost to a FileNotFoundError after all the computation is done.
    os.makedirs('output', exist_ok=True)
    with open('output/mus.pkl', 'wb') as f:
        pickle.dump(mus, f)
    for j, algo in enumerate(algos):
        with open('output/%s_by_mu_d_%d.pkl' % (algo['label'], d), 'wb') as f:
            pickle.dump(output[j], f)

    return output

def plot_cond_entropy_by_n(ax, num_plotted_trials, d, mu, algos, panel):
    '''
    Plot estimated conditional entropy against sample size on one axis.

    Reads the pickles written by get_cond_entropy_vs_n for dimension d.

    Parameters
    ----------
    ax : matplotlib Axes
        Axis to draw on.
    num_plotted_trials : int
        Number of individual trials to overlay as faint lines; capped at the
        number of trials actually stored.
    d : int
        Data dimension, used to locate the result files.
    mu : float
        Effect size, used for the ground-truth line and the title.
    algos : sequence of dict
        One dict per algorithm with 'label' and 'color' keys.
    panel : str
        Panel letter placed in the title.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced locally by this module.
    with open('output/sample_sizes_d_%d.pkl' % d, 'rb') as f:
        sample_sizes = np.array(pickle.load(f))

    for algo in algos:
        with open('output/%s_by_n_d_%d.pkl' % (algo['label'], d), 'rb') as f:
            result = pickle.load(f)
        # Plot the mean over trials as a solid line.
        ax.plot(sample_sizes,
                np.mean(result, axis = 1).flatten(),
                label = algo['label'],
                linewidth = 4,
                color = algo['color'])
        # Use transparent lines to show other trials; never index past the
        # trials that were actually saved.
        for t in range(min(num_plotted_trials, result.shape[1])):
            ax.plot(sample_sizes,
                    result[:, t].flatten(),
                    linewidth = 2,
                    color = algo['color'],
                    alpha = 0.15)

    truth = true_cond_entropy(mu)
    ax.axhline(y = truth, linestyle = '-', color = "black", label = "Truth")

    ax.yaxis.set_major_formatter(plt.FuncFormatter(format_func))
    ax.set_xlabel("Sample Size")
    ax.set_ylabel("Estimated Conditional Entropy")
    ax.set_title("%s) Effect Size = %.1f" % (panel, mu))
    ax.set_ylim(ymin = -0.05, ymax = 1.05)

def plot_cond_entropy_by_mu(ax, d, n, algos, panel):
    '''
    Plot estimated conditional entropy against effect size on one axis.

    Reads the pickles written by get_cond_entropy_vs_mu for dimension d.

    Parameters
    ----------
    ax : matplotlib Axes
        Axis to draw on.
    d : int
        Data dimension, used to locate the result files.
    n : int
        Sample size, shown in the title only.
    algos : sequence of dict
        One dict per algorithm with 'label' and 'color' keys.
    panel : str
        Panel letter placed in the title.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced locally by this module.
    with open('output/mus.pkl', 'rb') as f:
        mus = pickle.load(f)

    for algo in algos:
        with open('output/%s_by_mu_d_%d.pkl' % (algo['label'], d), 'rb') as f:
            result = pickle.load(f)
        # Plot the mean over trials as a solid line.
        ax.plot(mus,
                np.mean(result, axis = 1).flatten(),
                label = algo['label'],
                linewidth = 4,
                color = algo['color'])

    # Ground-truth curve evaluated at the same effect sizes.
    truth = [true_cond_entropy(mu) for mu in mus]
    ax.plot(mus, truth, label = 'Truth', linewidth = 4, color = 'black')

    ax.yaxis.set_major_formatter(plt.FuncFormatter(format_func))
    ax.set_ylim(ymin = -.05)
    ax.set_title("%s) n = %d" % (panel, n))
    ax.set_xlabel("Effect Size")
    ax.set_ylabel("Estimated Conditional Entropy")


def plot_fig2(num_plotted_trials, d1, d2, n1, n2, effect_size, algos):
    '''
    Draw the 2x2 figure: one row per dimension, one column per sweep.

    The left column shows conditional entropy versus sample size, the right
    column versus effect size. The top row uses dimension d1 (panels A, B)
    and the bottom row dimension d2 (panels C, D).

    Parameters
    ----------
    num_plotted_trials : int
        Number of individual trials overlaid in the sample-size panels.
    d1, d2 : int
        Data dimensions for the top and bottom rows.
    n1, n2 : int
        Sample sizes shown in the titles of the effect-size panels.
    effect_size : float
        Effect size used in the sample-size panels.
    algos : sequence of dict
        Algorithm descriptors forwarded to the per-panel plotting helpers.
    '''
    sns.set(font_scale = 3)
    sns.set_style("ticks")
    plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams['figure.figsize'] = [30, 20]
    fig, axes = plt.subplots(2, 2)

    # (dimension, sample size, left panel letter, right panel letter) per row.
    row_specs = ((d1, n1, "A", "B"), (d2, n2, "C", "D"))
    for row, (dim, size, left_panel, right_panel) in enumerate(row_specs):
        plot_cond_entropy_by_n(axes[row, 0], num_plotted_trials, dim, effect_size, algos, left_panel)
        plot_cond_entropy_by_mu(axes[row, 1], dim, size, algos, right_panel)

    # A single legend in the top-left panel serves the whole figure.
    axes[0,0].legend(loc = "upper left")

    # Row captions placed just outside the left edge of the figure.
    fig.text(-0.05, 0.27, 'd = %d' % d2, ha='left', va='center', fontsize = 40)
    fig.text(-0.05, 0.77, 'd = %d' % d1, ha='left', va='center', fontsize = 40)

    plt.subplots_adjust(left=-1)
    plt.tight_layout()
    # plt.savefig("fig2.pdf", bbox_inches = "tight")
    plt.show()
Binary file added docs/tutorials/output/CART_by_mu_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/CART_by_mu_d_3.pkl
Binary file not shown.
Binary file added docs/tutorials/output/CART_by_n_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/CART_by_n_d_3.pkl
Binary file not shown.
Binary file added docs/tutorials/output/IRF_by_mu_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/IRF_by_mu_d_3.pkl
Binary file not shown.
Binary file added docs/tutorials/output/IRF_by_n_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/IRF_by_n_d_3.pkl
Binary file not shown.
Binary file added docs/tutorials/output/UF_by_mu_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/UF_by_mu_d_3.pkl
Binary file not shown.
Binary file added docs/tutorials/output/UF_by_n_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/UF_by_n_d_3.pkl
Binary file not shown.
Binary file added docs/tutorials/output/mus.pkl
Binary file not shown.
Binary file added docs/tutorials/output/sample_sizes_d_1.pkl
Binary file not shown.
Binary file added docs/tutorials/output/sample_sizes_d_3.pkl
Binary file not shown.
4 changes: 2 additions & 2 deletions docs/tutorials/uncertaintyforest_fig1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
"# The following are two sets of parameters.\n",
"# The first are those that were actually used to produce figure 1.\n",
"# These take a long time to actually run since there are 6000 data points.\n",
"# Below those, you'll find some testing parameters so that you can see the results quicker.\n",
"# Below those, you'll find some testing parameters so that you can see the results more quickly.\n",
"\n",
"# Here are the \"Real Parameters\"\n",
"#n = 6000\n",
Expand Down Expand Up @@ -197,4 +197,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading