fixed to work with modern Pytorch
jim committed Aug 25, 2020
1 parent 7e2f170 commit ead9d92
Showing 5 changed files with 12 additions and 12 deletions.
8 changes: 4 additions & 4 deletions 1_dqn.py
@@ -93,7 +93,7 @@ def run_episode(episode, env):
     while True:
         # env.render()
         action = select_action(FloatTensor([state]))
-        next_state, reward, done, _ = env.step(action[0, 0])
+        next_state, reward, done, _ = env.step(action[0, 0].item())

         # negative reward when attempt ends
         if done:
@@ -136,14 +136,14 @@ def learn():

     batch_state = Variable(torch.cat(batch_state))
     batch_action = Variable(torch.cat(batch_action))
-    batch_reward = Variable(torch.cat(batch_reward))
+    batch_reward = Variable(torch.cat(batch_reward)).unsqueeze(-1)
     batch_next_state = Variable(torch.cat(batch_next_state))

     # current Q values are estimated by NN for all actions
     current_q_values = model(batch_state).gather(1, batch_action)
     # expected Q values are estimated from actions which gives maximum Q value
     max_next_q_values = model(batch_next_state).detach().max(1)[0]
-    expected_q_values = batch_reward + (GAMMA * max_next_q_values)
+    expected_q_values = batch_reward + (GAMMA * max_next_q_values).unsqueeze(-1)

     # loss is measured from error between current and newly expected Q values
     loss = F.smooth_l1_loss(current_q_values, expected_q_values)
@@ -161,7 +161,7 @@ def botPlay():
         frame = env.render(mode='rgb_array')
         frames.append(frame)
         action = select_action(FloatTensor([state]))
-        next_state, reward, done, _ = env.step(action[0, 0])
+        next_state, reward, done, _ = env.step(action[0, 0].item())

         state = next_state
         steps += 1
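Why the .item() change is needed: since PyTorch 0.4, indexing a tensor such as action[0, 0] returns a zero-dimensional tensor rather than a plain Python number, while Gym's env.step() for a discrete action space expects an int. A minimal sketch of the difference, assuming a recent PyTorch build (the Q-values below are made up for illustration):

import torch

q_values = torch.tensor([[0.1, 0.9]])    # stand-in for the network output, shape [1, 2]
action = q_values.max(1)[1].view(1, 1)   # greedy action as a [1, 1] LongTensor

print(type(action[0, 0]))                # <class 'torch.Tensor'> (0-dim tensor)
print(type(action[0, 0].item()))         # <class 'int'>, which env.step() accepts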
8 changes: 4 additions & 4 deletions 2_double_dqn.py
@@ -95,7 +95,7 @@ def run_episode(episode, env):
     while True:
         # env.render()
         action = select_action(FloatTensor([state]))
-        next_state, reward, done, _ = env.step(action[0, 0])
+        next_state, reward, done, _ = env.step(action[0, 0].item())

         # negative reward when attempt ends
         if done:
@@ -138,14 +138,14 @@ def learn():

     batch_state = Variable(torch.cat(batch_state))
     batch_action = Variable(torch.cat(batch_action))
-    batch_reward = Variable(torch.cat(batch_reward))
+    batch_reward = Variable(torch.cat(batch_reward)).unsqueeze(-1)
     batch_next_state = Variable(torch.cat(batch_next_state))

     # current Q values are estimated by NN for all actions
     current_q_values = model(batch_state).gather(1, batch_action)
     # expected Q values are estimated from actions which gives maximum Q value
     max_next_q_values = target(batch_next_state).detach().max(1)[0]
-    expected_q_values = batch_reward + (GAMMA * max_next_q_values)
+    expected_q_values = batch_reward + (GAMMA * max_next_q_values).unsqueeze(-1)

     # loss is measured from error between current and newly expected Q values
     loss = F.smooth_l1_loss(current_q_values, expected_q_values)
@@ -163,7 +163,7 @@ def botPlay():
         frame = env.render(mode='rgb_array')
         frames.append(frame)
         action = select_action(FloatTensor([state]))
-        next_state, reward, done, _ = env.step(action[0, 0])
+        next_state, reward, done, _ = env.step(action[0, 0].item())

         state = next_state
         steps += 1
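Why the unsqueeze(-1) changes are needed (the same fix appears in all three files): gather(1, batch_action) produces current Q-values of shape [B, 1], while .max(1)[0] produces [B]. In modern PyTorch, mixing a [B, 1] prediction with a [B] target makes smooth_l1_loss broadcast to [B, B] with only a size-mismatch warning, so the loss is silently wrong; keeping both sides at [B, 1] restores element-wise targets. A shape-only sketch with random stand-in tensors:

import torch

B, GAMMA = 4, 0.99
current_q = torch.randn(B, 1)   # model(batch_state).gather(1, batch_action) -> [B, 1]
batch_reward = torch.randn(B)   # torch.cat of rewards -> [B]
max_next_q = torch.randn(B)     # .detach().max(1)[0]  -> [B]

bad_target = batch_reward + GAMMA * max_next_q                                 # shape [B]
good_target = batch_reward.unsqueeze(-1) + (GAMMA * max_next_q).unsqueeze(-1)  # shape [B, 1]

print((current_q - bad_target).shape)    # torch.Size([4, 4]): unwanted broadcast
print((current_q - good_target).shape)   # torch.Size([4, 1]): element-wise, as intended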
8 changes: 4 additions & 4 deletions 3_dueling_dqn.py
@@ -94,7 +94,7 @@ def run_episode(episode, env):
     while True:
         # env.render()
         action = select_action(FloatTensor([state]))
-        next_state, reward, done, _ = env.step(action[0, 0])
+        next_state, reward, done, _ = env.step(action[0, 0].item())

         # negative reward when attempt ends
         if done:
@@ -137,14 +137,14 @@ def learn():

     batch_state = Variable(torch.cat(batch_state))
     batch_action = Variable(torch.cat(batch_action))
-    batch_reward = Variable(torch.cat(batch_reward))
+    batch_reward = Variable(torch.cat(batch_reward)).unsqueeze(-1)
     batch_next_state = Variable(torch.cat(batch_next_state))

     # current Q values are estimated by NN for all actions
     current_q_values = model(batch_state).gather(1, batch_action)
     # expected Q values are estimated from actions which gives maximum Q value
     max_next_q_values = model(batch_next_state).detach().max(1)[0]
-    expected_q_values = batch_reward + (GAMMA * max_next_q_values)
+    expected_q_values = batch_reward + (GAMMA * max_next_q_values).unsqueeze(-1)

     # loss is measured from error between current and newly expected Q values
     loss = F.smooth_l1_loss(current_q_values, expected_q_values)
@@ -162,7 +162,7 @@ def botPlay():
         frame = env.render(mode='rgb_array')
         frames.append(frame)
         action = select_action(FloatTensor([state]))
-        next_state, reward, done, _ = env.step(action[0, 0])
+        next_state, reward, done, _ = env.step(action[0, 0].item())

         state = next_state
         steps += 1
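A further modernization not made in this commit: the hunks above still wrap batches in torch.autograd.Variable, which has been a deprecated no-op since PyTorch 0.4, so the same lines could use plain tensors. A sketch under that assumption, with made-up transitions standing in for a replay-memory sample:

import torch

# made-up (state, action, reward, next_state) transitions in place of a replay sample
transitions = [(torch.randn(1, 4), torch.tensor([[0]]), torch.tensor([1.0]), torch.randn(1, 4))
               for _ in range(4)]
batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)

batch_state = torch.cat(batch_state)                  # [B, 4], no Variable(...) wrapper needed
batch_action = torch.cat(batch_action)                # [B, 1]
batch_reward = torch.cat(batch_reward).unsqueeze(-1)  # [B, 1], matching the shape fix above
batch_next_state = torch.cat(batch_next_state)        # [B, 4]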
Binary file added 4_policy_gradient_play.gif
Binary file added 4_policy_gradient_score.png
