forked from lazyprogrammer/machine_learning_examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcomparing_explore_exploit_methods.py
119 lines (94 loc) · 2.96 KB
/
comparing_explore_exploit_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import matplotlib.pyplot as plt
from comparing_epsilons import Bandit
from optimistic_initial_values import run_experiment as run_experiment_oiv
from ucb1 import run_experiment as run_experiment_ucb
class BayesianBandit:
    """One bandit arm with Gaussian rewards and a Gaussian belief over its mean.

    Rewards are drawn as ``true_mean`` plus standard-normal noise.  The belief
    about the unknown mean starts at N(0, 1) and is tracked through
    ``predicted_mean`` (its mean) and ``lambda_`` (its precision).
    """

    def __init__(self, true_mean):
        """Store the arm's real mean and initialise the belief to N(0, 1)."""
        self.true_mean = true_mean
        # Belief over mu: mean 0 and precision 1 before any reward is seen.
        self.predicted_mean = 0
        self.lambda_ = 1
        # Running total of observed rewards; lets update() recompute the mean
        # without storing every sample.
        self.sum_x = 0
        # Precision credited to each observed reward.
        self.tau = 1

    def pull(self):
        """Return one reward: a standard-normal draw shifted by ``true_mean``."""
        noise = np.random.randn()
        return noise + self.true_mean

    def sample(self):
        """Return one draw from the current belief about the arm's mean.

        A standard-normal draw is scaled by ``1 / sqrt(lambda_)`` and shifted
        by ``predicted_mean``.
        """
        draw = np.random.randn()
        return draw / np.sqrt(self.lambda_) + self.predicted_mean

    def update(self, x):
        """Fold the observed reward ``x`` into the belief."""
        self.sum_x += x
        self.lambda_ += self.tau
        # The prior mean is 0, so only the observed rewards contribute to the
        # numerator of the new mean.
        weighted_total = self.tau * self.sum_x
        self.predicted_mean = weighted_total / self.lambda_
def run_experiment_decaying_epsilon(m1, m2, m3, N):
    """Play three arms for N rounds using epsilon-greedy with epsilon = 1/(round).

    Each round either picks an arm uniformly at random (with probability
    ``1 / (i + 1)`` on 0-based round ``i``) or picks the arm whose ``mean``
    attribute is currently largest.  After the run the cumulative average
    reward is plotted on a log-scaled x axis next to one flat line per
    ``m1``/``m2``/``m3``, and each arm's ``mean`` is printed.

    Parameters:
        m1, m2, m3: values passed to ``Bandit`` to build the three arms.
        N: number of rounds to play.

    Returns:
        Array of length N holding the cumulative average reward per round.
    """
    arms = [Bandit(m) for m in (m1, m2, m3)]
    rewards = np.empty(N)

    for step in range(N):
        # The exploration probability shrinks as 1/(step + 1); on the first
        # round it is 1.0, so that round always explores.
        explore = np.random.random() < 1.0 / (step + 1)
        if explore:
            chosen = np.random.choice(len(arms))
        else:
            chosen = np.argmax([arm.mean for arm in arms])

        reward = arms[chosen].pull()
        arms[chosen].update(reward)

        # Keep the reward so the running average can be plotted afterwards.
        rewards[step] = reward

    cumulative_average = np.cumsum(rewards) / (np.arange(N) + 1)

    # Running average against a flat reference line for each of m1, m2, m3.
    plt.plot(cumulative_average)
    for reference in (m1, m2, m3):
        plt.plot(np.ones(N) * reference)
    plt.xscale('log')
    plt.show()

    for arm in arms:
        print(arm.mean)

    return cumulative_average
def run_experiment(m1, m2, m3, N):
    """Play three ``BayesianBandit`` arms for N rounds by posterior sampling.

    Every round draws one value from each arm's ``sample()`` and plays the arm
    with the largest draw (Thompson-sampling style selection), then feeds the
    reward back through that arm's ``update()``.  After the run the cumulative
    average reward is plotted on a log-scaled x axis next to one flat line per
    ``m1``/``m2``/``m3``.

    Parameters:
        m1, m2, m3: true means of the three arms.
        N: number of rounds to play.

    Returns:
        Array of length N holding the cumulative average reward per round.
    """
    arms = [BayesianBandit(m) for m in (m1, m2, m3)]
    rewards = np.empty(N)

    for step in range(N):
        # One draw per arm from its current belief; the largest draw wins.
        draws = [arm.sample() for arm in arms]
        chosen = np.argmax(draws)

        reward = arms[chosen].pull()
        arms[chosen].update(reward)

        # Keep the reward so the running average can be plotted afterwards.
        rewards[step] = reward

    cumulative_average = np.cumsum(rewards) / (np.arange(N) + 1)

    # Running average against a flat reference line for each true mean.
    plt.plot(cumulative_average)
    for reference in (m1, m2, m3):
        plt.plot(np.ones(N) * reference)
    plt.xscale('log')
    plt.show()

    return cumulative_average
if __name__ == '__main__':
    # Values handed to every experiment as the three arms' means.
    m1, m2, m3 = 1.0, 2.0, 3.0
    num_trials = 100000

    # Run the four strategies in a fixed order; each call shows its own
    # figure before returning the series compared below.
    eps = run_experiment_decaying_epsilon(m1, m2, m3, num_trials)
    oiv = run_experiment_oiv(m1, m2, m3, num_trials)
    ucb = run_experiment_ucb(m1, m2, m3, num_trials)
    bayes = run_experiment(m1, m2, m3, num_trials)

    labelled_curves = (
        (eps, 'decaying-epsilon-greedy'),
        (oiv, 'optimistic'),
        (ucb, 'ucb1'),
        (bayes, 'bayesian'),
    )

    # Two comparison figures: first with a log-scaled x axis, then linear.
    for log_x in (True, False):
        for curve, label in labelled_curves:
            plt.plot(curve, label=label)
        plt.legend()
        if log_x:
            plt.xscale('log')
        plt.show()