train.py
import argparse
import json
import random
import time
from collections import deque

import gym
import numpy as np
import pybullet_envs  # registers PyBullet envs such as HalfCheetahBulletEnv-v0; note the reward functions differ: Bullet-v0 starts around -1500, PyBullet-v0 starts at 0
import torch
from torch.utils.tensorboard import SummaryWriter

from scripts.agent import TD3_Agent
from scripts.buffer import ReplayBuffer, PrioritizedReplay

def timer(start, end):
    """Helper to print the total training time."""
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("\nTraining Time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

def evaluate(step, eval_runs=5, capture=False):
    """Run eval_runs evaluation episodes with the current policy and log the mean reward."""
    reward_batch = []
    for i in range(eval_runs):
        state = eval_env.reset()
        rewards = 0
        while True:
            action = agent.eval(np.expand_dims(state, axis=0))
            action_v = np.clip(action, action_low, action_high)
            state, reward, done, _ = eval_env.step(action_v)
            rewards += reward
            if done:
                break
        reward_batch.append(rewards)
    if not capture:
        writer.add_scalar("Test_Reward", np.mean(reward_batch), step)

def fill_buffer(agent, env, samples=1000):
    """Pre-fill the replay buffer with transitions collected by a random policy."""
    collected_samples = 0
    state = env.reset()
    state = state.reshape((1, state_size))
    for i in range(samples):
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        next_state = next_state.reshape((1, state_size))
        agent.memory.add(state, action, reward, next_state, done)
        collected_samples += 1
        state = next_state
        if done:
            state = env.reset()
            state = state.reshape((1, state_size))
    print("Added random samples to the buffer - buffer size: ", len(agent.memory))

def train(args):
    """Training loop: interact with the environment for args.steps steps and log episode returns."""
    scores_deque = deque(maxlen=100)
    average_100_scores = []
    scores = []
    i_episode = 1
    state = env.reset()
    state = state.reshape((1, state_size))
    score = 0
    steps = args.steps
    for step in range(1, steps + 1):
        # evaluation runs
        if step % args.eval_every == 0 or step == 1:
            evaluate(step, args.eval_runs)

        action = agent.act(state)
        action_v = action.numpy()
        action_v = np.clip(action_v, action_low, action_high)
        next_state, reward, done, info = env.step(action_v)
        next_state = next_state.reshape((1, state_size))
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            scores_deque.append(score)
            scores.append(score)
            average_100_scores.append(np.mean(scores_deque))
            writer.add_scalar("Average100", np.mean(scores_deque), step)
            writer.add_scalar("Train_Reward", score, step)
            state = env.reset()
            state = state.reshape((1, state_size))
            print('\rEpisode {} Env. Step: [{}/{}] Train-Reward: {:.2f} Average100 Score: {:.2f}'.format(i_episode, step, steps, score, np.mean(scores_deque)), end="")
            if i_episode % args.print_every == 0:
                print('\rEpisode {} Env. Step: [{}/{}] Train-Reward: {:.2f} Average100 Score: {:.2f}'.format(i_episode, step, steps, score, np.mean(scores_deque)))
            score = 0
            i_episode += 1
    return np.mean(scores_deque)

parser = argparse.ArgumentParser(description="")
parser.add_argument("--env", type=str, default="HalfCheetahBulletEnv-v0", help="Environment name, default = HalfCheetahBulletEnv-v0")
parser.add_argument("--info", type=str, default="TD3-training-run-1", help="Information or name of the run")
parser.add_argument("--steps", type=int, default=1_000_000, help="Number of training interactions with the environment, default = 1,000,000")
parser.add_argument("--collect_random", type=int, default=5_000, help="Number of transitions collected with a random policy before training starts, default = 5000")
parser.add_argument("--eval_every", type=int, default=10_000, help="Number of interactions after which the evaluation runs are performed, default = 10,000")
parser.add_argument("--eval_runs", type=int, default=1, help="Number of evaluation runs performed, default = 1")
parser.add_argument("--seed", type=int, default=0, help="Seed for the env and torch network weights, default = 0")
parser.add_argument("--nstep", type=int, default=1, help="Use multi-step Q-learning with the given n-step, default = 1")
parser.add_argument("--per", type=int, default=0, choices=[0, 1], help="Use Prioritized Experience Replay if set to 1, default = 0")
parser.add_argument("--lr", type=float, default=3e-4, help="Actor learning rate, default = 3e-4")
parser.add_argument("--layer_size", type=int, default=256, help="Number of nodes per neural network layer, default = 256")
parser.add_argument("--replay_memory", type=int, default=int(1e6), help="Size of the replay memory, default = 1e6")
parser.add_argument("-bs", "--batch_size", type=int, default=256, help="Batch size, default = 256")
parser.add_argument("-t", "--tau", type=float, default=0.005, help="Soft-update factor tau, default = 0.005")
parser.add_argument("-g", "--gamma", type=float, default=0.99, help="Discount factor gamma, default = 0.99")
parser.add_argument("--print_every", type=int, default=100, help="Print recent training results every x episodes, default = 100")
args = parser.parse_args()
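
# Example invocation (a sketch; it assumes the local scripts/ package providing ReplayBuffer,
# PrioritizedReplay and TD3_Agent is importable from the working directory):
#   python train.py --env HalfCheetahBulletEnv-v0 --info TD3-run-1 --per 1 --nstep 3
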
if __name__ == "__main__":
    writer = SummaryWriter("runs/" + args.info)
    env = gym.make(args.env)
    eval_env = gym.make(args.env)
    action_high = env.action_space.high[0]
    action_low = env.action_space.low[0]

    seed = args.seed
    torch.manual_seed(seed)
    env.seed(seed)
    eval_env.seed(seed + 1)
    np.random.seed(seed)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if args.per == 0:
        replay_buffer = ReplayBuffer(buffer_size=args.replay_memory,
                                     batch_size=args.batch_size,
                                     seed=seed,
                                     gamma=args.gamma,
                                     n_step=args.nstep,
                                     device=device)
    else:
        replay_buffer = PrioritizedReplay(capacity=args.replay_memory,
                                          batch_size=args.batch_size,
                                          device=device,
                                          seed=seed,
                                          gamma=args.gamma,
                                          beta_frames=args.steps,
                                          n_step=args.nstep)

    agent = TD3_Agent(args=args,
                      state_size=state_size,
                      action_size=action_size,
                      action_low=action_low,
                      action_high=action_high,
                      replay_buffer=replay_buffer,
                      device=device)

    fill_buffer(agent, env=env, samples=args.collect_random)

    t0 = time.time()
    final_average100 = train(args)
    t1 = time.time()
    env.close()
    timer(t0, t1)

    # save parameters and the final metric
    with open('runs/' + args.info + ".json", 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    hparams = vars(args)
    metric = {"final average 100 train reward": final_average100}
    writer.add_hparams(hparams, metric)