main.py
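# Train a Categorical DQN (C51) agent, optionally with Double Q-learning, on
# CartPole-v1 or LunarLander-v2 (Gymnasium), with periodic evaluation,
# TensorBoard logging, and model checkpointing.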
from utils import evaluate_policy, render_policy, str2bool
from datetime import datetime
from Categorical_DQN import CDQN_agent
import gymnasium as gym
import os, shutil
import argparse
import torch
'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cuda', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='environment index: 0 = CartPole-v1, 1 = LunarLander-v2')
parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=320, help='which model to load')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--Max_train_steps', type=int, default=int(400e3), help='Max training steps')
parser.add_argument('--save_interval', type=int, default=int(20e3), help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=int(2e3), help='Model evaluating interval, in steps.')
parser.add_argument('--random_steps', type=int, default=int(3e3), help='steps for random policy to explore')
parser.add_argument('--update_every', type=int, default=50, help='training frequency')
parser.add_argument('--gamma', type=float, default=0.99, help='discount factor')
parser.add_argument('--net_width', type=int, default=200, help='Hidden net width')
parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
parser.add_argument('--batch_size', type=int, default=256, help='batch size of transitions sampled from the replay buffer')
parser.add_argument('--exp_noise', type=float, default=0.2, help='exploration noise')
parser.add_argument('--noise_decay', type=float, default=0.99, help='decay rate of exploration noise')
parser.add_argument('--DQL', type=str2bool, default=True, help='Whether to use Double Q-learning')
parser.add_argument('--v_min', type=float, default=-100, help='Vmin')
parser.add_argument('--v_max', type=float, default=100, help='Vmax')
parser.add_argument('--n_atoms', type=int, default=51, help='number of atoms')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc) # from str to torch.device
print(opt)
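# The v_min/v_max/n_atoms settings define the fixed support of the C51 return
# distribution. The actual construction presumably lives in Categorical_DQN.py;
# a minimal sketch of what these hyperparameters imply (names here are illustrative):
#   delta_z = (opt.v_max - opt.v_min) / (opt.n_atoms - 1)          # atom spacing
#   support = torch.linspace(opt.v_min, opt.v_max, opt.n_atoms)    # z_0 ... z_{N-1}
#   Q(s, a) = (p(s, a) * support).sum()   # expected return from the per-atom probabilities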
def main():
    EnvName = ['CartPole-v1', 'LunarLander-v2']
    BriefEnvName = ['CPV1', 'LLdV2']
    env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    eval_env = gym.make(EnvName[opt.EnvIdex])
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.n
    opt.max_e_steps = env._max_episode_steps
    opt.action_info = {0: ['Left', 'Right'], 1: ['Noop', 'LeftEngine', 'MainEngine', 'RightEngine']}
    algo_name = 'C51_' + ('DDQN' if opt.DQL else 'DQN')  # parentheses needed: '+' binds tighter than the conditional
    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))
    print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}_{}'.format(algo_name, BriefEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath): shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)
    # Build model and replay buffer
    if not os.path.exists('model'): os.mkdir('model')
    agent = CDQN_agent(**vars(opt))
    if opt.Loadmodel: agent.load(algo_name, BriefEnvName[opt.EnvIdex], opt.ModelIdex)
    if opt.render:
        render_policy(env, agent, opt)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            s, info = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
            env_seed += 1
            done = False
            '''Interact & train'''
            while not done:
                # e-greedy exploration
                if total_steps < opt.random_steps: a = env.action_space.sample()
                else: a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a)  # dw: dead & win (terminated); tr: truncated
                done = (dw or tr)
                agent.replay_buffer.add(s, a, r, s_next, dw)
                s = s_next
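                # Note: only dw (true termination) is stored as the done flag, so a
                # time-limit truncation presumably does not cut off bootstrapping when
                # the agent builds its targets.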
                '''update if it's time'''
                # Train update_every times every update_every steps, rather than once per step. Better!
                if total_steps >= opt.random_steps and total_steps % opt.update_every == 0:
                    for j in range(opt.update_every): agent.train()
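                # Each agent.train() call presumably samples a batch, builds the target
                # distribution with the distributional Bellman operator (using the online
                # net to pick the next action when --DQL is True, i.e. Double Q-learning),
                # projects it onto the fixed atom support, and minimizes the cross-entropy
                # to the predicted distribution. See Categorical_DQN.py for the actual update.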
                '''record & log'''
                if total_steps % opt.eval_interval == 0:
                    agent.exp_noise *= opt.noise_decay
                    score = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write:
                        writer.add_scalar('ep_r', score, global_step=total_steps)
                        writer.add_scalar('noise', agent.exp_noise, global_step=total_steps)
                    print('EnvName:', BriefEnvName[opt.EnvIdex], 'seed:', opt.seed,
                          'steps: {}k'.format(int(total_steps/1000)), 'score:', int(score))
                total_steps += 1
                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(algo_name, BriefEnvName[opt.EnvIdex], int(total_steps/1000))
        env.close()
        eval_env.close()
if __name__ == '__main__':
    main()
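# Example usage with the flags defined above:
#   python main.py                                                  # train C51-DDQN on CartPole-v1
#   python main.py --EnvIdex 1 --write True                         # train on LunarLander-v2 with TensorBoard logging
#   python main.py --render True --Loadmodel True --ModelIdex 320   # watch a saved checkpoint
# (LunarLander-v2 needs the Box2D extra: pip install gymnasium[box2d])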