#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
# pyre-unsafe
import math
from abc import ABC, abstractmethod
from typing import List, Optional
import torch
from reagent.mab.mab_algorithm import MABAlgo
from torch import Tensor


class BaseUCB(MABAlgo, ABC):
"""
Base class for UCB-like Multi-Armed Bandits (MAB)
Args:
        estimate_variance: If True, per-arm reward variance is estimated and the confidence
            interval width is multiplied by its square root
min_variance: The lower bound applied to the estimated variance. If variance is not estimated, this value is used instead of an estimate.
        alpha: Scalar multiplier for the confidence interval width. Values above 1.0 make
            exploration more aggressive; values below 1.0 make it less aggressive
"""
def __init__(
self,
randomize_ties: bool = True,
estimate_variance: bool = True,
min_variance: float = 0.0,
alpha: float = 1.0,
min_num_obs_per_arm: int = 1,
*,
n_arms: Optional[int] = None,
arm_ids: Optional[List[str]] = None,
) -> None:
super().__init__(
n_arms=n_arms,
arm_ids=arm_ids,
randomize_ties=randomize_ties,
min_num_obs_per_arm=min_num_obs_per_arm,
)
self.estimate_variance = estimate_variance
self.min_variance = torch.tensor(min_variance)
self.alpha = alpha
@property
def var(self):
# return empirical variance of rewards for each arm
if self.estimate_variance:
return torch.fmax(
self.min_variance,
self.total_sum_reward_squared_per_arm / self.total_n_obs_per_arm
- ((self.total_sum_reward_per_arm / self.total_n_obs_per_arm) ** 2),
)
else:
return self.min_variance
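
# Illustrative note on the variance estimate (hypothetical numbers): the `var` property
# above is the per-arm empirical variance E[r^2] - (E[r])^2, computed from the running
# totals maintained by MABAlgo and clamped from below by `min_variance`. For one arm with
# sum(r) = 5.0 and sum(r^2) = 13.0 over n = 5 observations, the estimate is
# 13/5 - (5/5) ** 2 = 1.6.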


class UCB1(BaseUCB):
"""
Canonical implementation of UCB1
Reference: https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf
"""
def get_scores(self) -> Tensor:
"""
Get per-arm UCB scores. The formula is
        UCB_i = AVG([rewards_i]) + SQRT(2*LN(T+1)/N_i*VAR)
        where VAR=min_variance if estimate_variance==False, otherwise
        VAR=MAX(min_variance, AVG([rewards_i**2]) - AVG([rewards_i])**2)
Returns:
Tensor: An array of UCB scores (one per arm)
"""
avg_rewards = self.get_avg_reward_values()
log_t_over_ni = (
math.log(self.total_n_obs_all_arms + 1) / self.total_n_obs_per_arm
)
# pyre-fixme[6]: For 1st param expected `Tensor` but got `float`.
return avg_rewards + self.alpha * torch.sqrt(2 * log_t_over_ni * self.var)
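
# Worked example for the UCB1 score (hypothetical numbers): with T = 100 total
# observations, N_i = 10 pulls of arm i, an average reward of 0.5, alpha = 1.0, and VAR
# fixed to 1 (estimate_variance=False, min_variance=1.0), the score is
# 0.5 + sqrt(2 * ln(101) / 10) ~= 1.46.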


class MetricUCB(BaseUCB):
"""
    An improvement over UCB1 that uses a more precise confidence radius, especially for small expected rewards.
This algorithm has been constructed for Bernoulli reward distributions.
Reference: https://arxiv.org/pdf/0809.4882.pdf
"""
def get_scores(self) -> Tensor:
"""
Get per-arm UCB scores. The formula is
UCB_i = AVG([rewards_i]) + SQRT(AVG([rewards_i]) * LN(T+1)/N_i) + LN(T+1)/N_i
Returns:
Tensor: An array of UCB scores (one per arm)
"""
avg_rewards = self.get_avg_reward_values()
log_t_over_ni = (
math.log(self.total_n_obs_all_arms + 1) / self.total_n_obs_per_arm
)
return avg_rewards + self.alpha * (
torch.sqrt(avg_rewards * log_t_over_ni) + log_t_over_ni
)
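
# Worked comparison (hypothetical numbers): for a small average reward of 0.01 with
# ln(T+1)/N_i = 0.05 and alpha = 1.0, the MetricUCB confidence radius is
# sqrt(0.01 * 0.05) + 0.05 ~= 0.072, while the UCB1 radius with VAR = 1 would be
# sqrt(2 * 0.05) ~= 0.32; the radius adapts to the small expected reward.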


class UCBTuned(BaseUCB):
"""
Implementation of the UCB-Tuned algorithm from Section 4 of https://link.springer.com/content/pdf/10.1023/A:1013689704352.pdf
    The biggest difference from basic UCB1 is that an estimate of the per-arm reward variance is used in the confidence radius.
    IMPORTANT: This algorithm should only be used if the rewards of each arm follow a Bernoulli distribution.
"""
def get_scores(self) -> Tensor:
"""
Get per-arm UCB scores. The formula is
        UCB_i = AVG([rewards_i]) + SQRT(LN(T+1)/N_i * MIN(V_i, 0.25))
        where V_i is a conservative variance estimate of arm i:
        V_i = AVG([rewards_i**2]) - AVG([rewards_i])**2 + SQRT(2*LN(T+1)/N_i)
Returns:
Tensor: An array of UCB scores (one per arm)
"""
avg_rewards = self.get_avg_reward_values()
log_t_over_ni = (
math.log(self.total_n_obs_all_arms + 1) / self.total_n_obs_per_arm
)
per_arm_var_est = (
self.total_sum_reward_squared_per_arm / self.total_n_obs_per_arm
# pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
# `int`.
- avg_rewards**2
+ torch.sqrt(
# pyre-fixme[6]: For 1st param expected `Tensor` but got `float`.
2 * log_t_over_ni
) # additional term to make the estimate conservative (unlikely to underestimate)
)
return avg_rewards + self.alpha * torch.sqrt(
log_t_over_ni * torch.fmin(per_arm_var_est, torch.tensor(0.25))
)
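
# Note on the 0.25 cap: a Bernoulli reward with success probability p has variance
# p * (1 - p) <= 0.25, so min(V_i, 0.25) keeps the conservative variance estimate within
# the range attainable for Bernoulli rewards (see the Bernoulli-only note in the class
# docstring above).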


def get_bernoulli_ucb_tuned_scores(
n_obs_per_arm: Tensor, num_success_per_arm: Tensor
) -> Tensor:
"""
    A minimalistic function that implements UCB-Tuned for a Bernoulli bandit.
    It exists only to benchmark the execution-time penalty incurred by the class-based implementation.
"""
avg_rewards = num_success_per_arm / n_obs_per_arm
log_t_over_ni = torch.log(torch.sum(n_obs_per_arm)) / n_obs_per_arm
per_arm_var_est = (
avg_rewards
# pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
- avg_rewards**2
+ torch.sqrt(
2 * log_t_over_ni
) # additional term to make the estimate conservative (unlikely to underestimate)
)
    return avg_rewards + torch.sqrt(
        log_t_over_ni * torch.fmin(per_arm_var_est, torch.tensor(0.25))
    )
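

if __name__ == "__main__":
    # Minimal usage sketch with made-up per-arm totals; all numbers are illustrative only.
    # It evaluates the UCB1 formula documented above directly from the totals and then
    # calls the standalone Bernoulli UCB-Tuned helper defined in this module.
    n_obs_per_arm = torch.tensor([40.0, 30.0, 30.0])
    sum_reward_per_arm = torch.tensor([16.0, 15.0, 21.0])  # for Bernoulli rewards, = successes

    avg_rewards = sum_reward_per_arm / n_obs_per_arm
    log_t_over_ni = math.log(n_obs_per_arm.sum().item() + 1) / n_obs_per_arm

    # UCB1 scores with VAR fixed to 1 (i.e. estimate_variance=False, min_variance=1.0)
    print("UCB1 scores:", avg_rewards + torch.sqrt(2 * log_t_over_ni))

    # UCB-Tuned scores via the standalone helper (rewards assumed Bernoulli, so the
    # reward sums double as success counts)
    print(
        "UCB-Tuned scores:",
        get_bernoulli_ucb_tuned_scores(n_obs_per_arm, sum_reward_per_arm),
    )

    # A class-based run would look roughly like the sketch below; it assumes the MABAlgo
    # base class exposes add_batch_observations(n_obs_per_arm, sum_reward_per_arm,
    # sum_reward_squared_per_arm); check reagent.mab.mab_algorithm for the exact
    # interface before relying on it.
    # ucb = UCB1(n_arms=3)
    # ucb.add_batch_observations(n_obs_per_arm, sum_reward_per_arm, sum_reward_per_arm)
    # print("UCB1 (class) scores:", ucb.get_scores())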