diff --git a/clean_pufferl.py b/clean_pufferl.py
index 4bf6f1b4..b9ede3b8 100644
--- a/clean_pufferl.py
+++ b/clean_pufferl.py
@@ -119,7 +119,7 @@ def evaluate(data):
     with profile.eval_misc:
         value = value.flatten()
         actions = actions.cpu().numpy()
-        mask = torch.as_tensor(mask)# * policy.mask)
+        mask = torch.as_tensor(mask)
         o = o if config.cpu_offload else o_device
         experience.store(o, value, actions, logprob, r, d, env_id, mask)

@@ -411,7 +411,7 @@ def __init__(self, batch_size, bptt_horizon, minibatch_size, hidden_size,
         self.dones=torch.zeros(batch_size, pin_memory=pin)
         self.truncateds=torch.zeros(batch_size, pin_memory=pin)
         self.values=torch.zeros(batch_size, pin_memory=pin)
-        self.e3b_inv = 1*torch.eye(hidden_size).repeat(lstm_total_agents, 1, 1).to(device)
+        self.e3b_inv = 10*torch.eye(hidden_size).repeat(lstm_total_agents, 1, 1).to(device)

         self.actions_np = np.asarray(self.actions)
         self.logprobs_np = np.asarray(self.logprobs)
diff --git a/pufferlib/models.py b/pufferlib/models.py
index 5d9e8b29..5ff66239 100644
--- a/pufferlib/models.py
+++ b/pufferlib/models.py
@@ -86,16 +86,16 @@ def decode_actions(self, hidden, lookup, concat=True, e3b=None):
             batch = hidden.shape[0]
             return probs, value

-        intrinsic_reward = None
+        b = None
         if e3b is not None:
             phi = hidden.detach()
-            intrinsic_reward = (phi.unsqueeze(1) @ e3b @ phi.unsqueeze(2))
-            e3b = 0.95*e3b - (phi.unsqueeze(2) @ phi.unsqueeze(1))/(1 + intrinsic_reward)
-            intrinsic_reward = intrinsic_reward.squeeze()
-            intrinsic_reward = 0.1*torch.clamp(intrinsic_reward, -1, 1)
+            u = phi.unsqueeze(1) @ e3b
+            b = u @ phi.unsqueeze(2)
+            e3b = 0.99*e3b - (u.mT @ u) / (1 + b)
+            b = b.squeeze()

         actions = self.decoder(hidden)
-        return actions, value, e3b, intrinsic_reward
+        return actions, value, e3b, b

 class LSTMWrapper(nn.Module):
     def __init__(self, env, policy, input_size=128, hidden_size=128, num_layers=1):
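
Note on the models.py hunk: the rewritten update computes u = phi^T @ e3b once and reuses it, so the subtracted term becomes (C^-1 phi)(phi^T C^-1)/(1 + b), which is the Sherman-Morrison form for updating an inverse covariance (the old code subtracted phi phi^T / (1 + b) instead). The decay also moves from 0.95 to 0.99, the 0.1 scale and clamp on the bonus are dropped, and clean_pufferl.py now initializes e3b_inv to 10*I. A minimal standalone sketch of the new update, with assumed shapes (B agents, H hidden features) and random phi purely for illustration:

    import torch

    B, H = 4, 8
    e3b = 10 * torch.eye(H).repeat(B, 1, 1)  # per-agent inverse covariance, 10*I as in the new init
    phi = torch.randn(B, H)                  # stands in for the detached hidden features

    u = phi.unsqueeze(1) @ e3b               # (B, 1, H): phi^T C^-1, computed once and reused
    b = u @ phi.unsqueeze(2)                 # (B, 1, 1): elliptic bonus phi^T C^-1 phi
    e3b = 0.99 * e3b - (u.mT @ u) / (1 + b)  # decayed Sherman-Morrison rank-1 update;
                                             # u.mT @ u == (C^-1 phi)(phi^T C^-1) because
                                             # e3b starts symmetric and stays symmetric
    b = b.squeeze()                          # (B,): raw per-agent bonus, no longer scaled/clamped here

Since e3b is symmetric, u.mT (torch's batched transpose of the last two dims) turns the (B, 1, H) row into the (B, H, 1) column C^-1 phi, so the outer product matches the exact Sherman-Morrison numerator without a second e3b matmul.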