# run_td3.py
from pprint import pprint
from ERP_env2 import ERP
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
from torch.utils.tensorboard import SummaryWriter
class Actor(nn.Module):
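    """Deterministic policy network: maps a state vector to an action vector."""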
def __init__(self, state_dim, action_dim, hidden_width, max_action):
super(Actor, self).__init__()
self.max_action = max_action
self.l1 = nn.Linear(state_dim, hidden_width)
self.l2 = nn.Linear(hidden_width, hidden_width)
self.l3 = nn.Linear(hidden_width, action_dim)
    def forward(self, s):
        s = F.relu(self.l1(s))
        s = F.relu(self.l2(s))
        # a = self.max_action * torch.tanh(self.l3(s))  # [-max, max]
        a = self.l3(s)
        # Limit the magnitude of a: divide by the largest |component| so every entry
        # lies in [-1, 1], then scale each dimension to its own range (200, 5, 4, 2, 1).
        # Implemented with torch ops so the actor stays differentiable and handles
        # batched input; rounding to integers is done in choose_action so gradients
        # can flow during training.
        scale = torch.tensor([200.0, 5.0, 4.0, 2.0, 1.0], device=a.device)
        a = a / (a.abs().max(dim=-1, keepdim=True).values + 1e-8)
        return a * scale
class Critic(nn.Module): # According to (s,a), directly calculate Q(s,a)
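    """Twin Q-networks (Q1 and Q2) used for clipped double Q-learning."""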
def __init__(self, state_dim, action_dim, hidden_width):
super(Critic, self).__init__()
# Q1
self.l1 = nn.Linear(state_dim + action_dim, hidden_width)
self.l2 = nn.Linear(hidden_width, hidden_width)
self.l3 = nn.Linear(hidden_width, 1)
# Q2
self.l4 = nn.Linear(state_dim + action_dim, hidden_width)
self.l5 = nn.Linear(hidden_width, hidden_width)
self.l6 = nn.Linear(hidden_width, 1)
def forward(self, s, a):
s_a = torch.cat([s, a], 1)
q1 = F.relu(self.l1(s_a))
q1 = F.relu(self.l2(q1))
q1 = self.l3(q1)
q2 = F.relu(self.l4(s_a))
q2 = F.relu(self.l5(q2))
q2 = self.l6(q2)
return q1, q2
def Q1(self, s, a):
s_a = torch.cat([s, a], 1)
q1 = F.relu(self.l1(s_a))
q1 = F.relu(self.l2(q1))
q1 = self.l3(q1)
return q1
class ReplayBuffer(object):
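    """Fixed-size circular buffer of (s, a, r, s_, dw) transitions for off-policy learning."""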
def __init__(self, state_dim, action_dim):
self.max_size = int(1e6)
self.count = 0
self.size = 0
self.s = np.zeros((self.max_size, state_dim))
self.a = np.zeros((self.max_size, action_dim))
self.r = np.zeros((self.max_size, 1))
self.s_ = np.zeros((self.max_size, state_dim))
self.dw = np.zeros((self.max_size, 1))
def store(self, s, a, r, s_, dw):
self.s[self.count] = s
self.a[self.count] = a
self.r[self.count] = r
self.s_[self.count] = s_
self.dw[self.count] = dw
self.count = (self.count + 1) % self.max_size # When the 'count' reaches max_size, it will be reset to 0.
self.size = min(self.size + 1, self.max_size) # Record the number of transitions
def sample(self, batch_size):
index = np.random.choice(self.size, size=batch_size) # Randomly sampling
batch_s = torch.tensor(self.s[index], dtype=torch.float)
batch_a = torch.tensor(self.a[index], dtype=torch.float)
batch_r = torch.tensor(self.r[index], dtype=torch.float)
batch_s_ = torch.tensor(self.s_[index], dtype=torch.float)
batch_dw = torch.tensor(self.dw[index], dtype=torch.float)
return batch_s, batch_a, batch_r, batch_s_, batch_dw
class TD3(object):
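    """TD3 agent: twin critics, target policy smoothing, and delayed (less frequent) actor updates."""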
def __init__(self, state_dim, action_dim, max_action):
self.max_action = max_action
self.hidden_width = 256 # The number of neurons in hidden layers of the neural network
self.batch_size = 256 # batch size
self.GAMMA = 0.99 # discount factor
self.TAU = 0.005 # Softly update the target network
self.lr = 3e-4 # learning rate
self.policy_noise = 0.2 * max_action # The noise for the trick 'target policy smoothing'
self.noise_clip = 0.5 * max_action # Clip the noise
self.policy_freq = 2 # The frequency of policy updates
self.actor_pointer = 0
self.actor = Actor(state_dim, action_dim, self.hidden_width, max_action)
self.actor_target = copy.deepcopy(self.actor)
self.critic = Critic(state_dim, action_dim, self.hidden_width)
self.critic_target = copy.deepcopy(self.critic)
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)
    def choose_action(self, s):
        s = torch.unsqueeze(torch.tensor(s, dtype=torch.float), 0)
        a = self.actor(s).data.numpy().flatten()
        # Round the action components to integers here (matching the scaling in
        # Actor.forward) rather than inside the network, which must stay differentiable.
        return np.rint(a).astype(int)
    def learn(self, replay_buffer):
        self.actor_pointer += 1
        batch_s, batch_a, batch_r, batch_s_, batch_dw = replay_buffer.sample(self.batch_size)  # Sample a batch
# Compute the target Q
with torch.no_grad(): # target_Q has no gradient
            # Trick 1: target policy smoothing
            # torch.randn_like generates noise sampled from N(0, 1) with the same shape as 'batch_a'
noise = (torch.randn_like(batch_a) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
next_action = (self.actor_target(batch_s_) + noise).clamp(-self.max_action, self.max_action)
            # Trick 2: clipped double Q-learning
target_Q1, target_Q2 = self.critic_target(batch_s_, next_action)
# print(target_Q1.size())
# print(target_Q2.size())
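            # TD target: y = r + GAMMA * (1 - dw) * min(Q1'(s', a'), Q2'(s', a'))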
target_Q = batch_r + self.GAMMA * (1 - batch_dw) * torch.min(target_Q1, target_Q2)
# print(target_Q.size())
# Get the current Q
current_Q1, current_Q2 = self.critic(batch_s, batch_a)
# Compute the critic loss
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
# Optimize the critic
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
        # Trick 3: delayed policy updates
if self.actor_pointer % self.policy_freq == 0:
# Freeze critic networks so you don't waste computational effort
for params in self.critic.parameters():
params.requires_grad = False
# Compute actor loss
actor_loss = -self.critic.Q1(batch_s, self.actor(batch_s)).mean() # Only use Q1
# Optimize the actor
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# Unfreeze critic networks
for params in self.critic.parameters():
params.requires_grad = True
# Softly update the target networks
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
target_param.data.copy_(self.TAU * param.data + (1 - self.TAU) * target_param.data)
def evaluate_policy(env, agent):
    """Run a few evaluation episodes without exploration noise and return the average reward."""
    try:
        times = 3  # Perform three evaluations and calculate the average
        evaluate_reward = 0
        for _ in range(times):
            s = env.reset()
            done = False
            episode_reward = 0
            while not done:
                a = agent.choose_action(s)  # We do not add noise when evaluating
                s_, r, done, _ = env.step(a)
                episode_reward += r
                s = s_
            evaluate_reward += episode_reward
        return int(evaluate_reward / times)
    except Exception:  # e.g. the reward is NaN and int() fails
        print('evaluate_policy: reward is NaN or the rollout failed')
        return float('nan')
def reward_adapter(r, env_index):
if env_index == 0: # Pendulum-v1
r = (r + 8) / 8
elif env_index == 1: # BipedalWalker-v3
if r <= -100:
r = -1
return r
if __name__ == '__main__':
# env_name = ['Pendulum-v1', 'BipedalWalker-v3', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2']
# env_index = 2
# env = gym.make(env_name[env_index])
# env_evaluate = gym.make(env_name[env_index]) # When evaluating the policy, we need to rebuild an environment
# number = 1
# Set random seed
    # Our environment
env = ERP()
env_evaluate = ERP()
seed = 0
# env.seed(seed)
# env.action_space.seed(seed)
# env_evaluate.seed(seed)
# env_evaluate.action_space.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
state_dim = 12
action_dim = 5
max_action = 2
max_episode_steps = 120 # Maximum number of steps per episode
# print("env={}".format(env_name[env_index]))
print("state_dim={}".format(state_dim))
print("action_dim={}".format(action_dim))
print("max_action={}".format(max_action))
print("max_episode_steps={}".format(max_episode_steps))
agent = TD3(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer(state_dim, action_dim)
# Build a tensorboard
writer = SummaryWriter(log_dir='runs/TD3/TD3_env_seed_{}'.format(seed))
noise_std = 0.1 * max_action # the std of Gaussian noise for exploration
max_train_steps = 3e4 # Maximum number of training steps
    random_steps = 3000 # Take random actions at the beginning for better exploration
evaluate_freq = 5e3 # Evaluate the policy every 'evaluate_freq' steps
evaluate_num = 0 # Record the number of evaluations
evaluate_rewards = [] # Record the rewards during the evaluating
total_steps = 0 # Record the total steps during the training
while total_steps < max_train_steps:
s = env.reset()
episode_steps = 0
done = False
while not done:
episode_steps += 1
# if total_steps < random_steps: # Take random actions in the beginning for the better exploration
# # a = env.action_space.sample()
# # pass
# a = agent.choose_action(s)
# # a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action)
# else:
# # Add Gaussian noise to action for exploration
# a = agent.choose_action(s)
# # a = (a + np.random.normal(0, noise_std, size=action_dim)).clip(-max_action, max_action)
a = agent.choose_action(s)
            # Limiting the magnitude of a is handled inside Actor.forward
s_, r, done, _ = env.step(a)
print('s, a, s_, r, done, _', s, a, s_, r, done, _)
# r = reward_adapter(r, env_index) # Adjust rewards for better performance
            # When the agent dies, wins, or reaches max_episode_steps, done becomes True,
            # and we need to distinguish these cases: dw means "dead or win", i.e. there is
            # no next state s'; when max_episode_steps is reached, a next state s' does exist.
            dw = done and episode_steps != max_episode_steps
replay_buffer.store(s, a, r, s_, dw) # Store the transition
s = s_
# Update one step
if total_steps >= random_steps:
agent.learn(replay_buffer)
# Evaluate the policy every 'evaluate_freq' steps
if (total_steps + 1) % evaluate_freq == 0:
evaluate_num += 1
evaluate_reward = evaluate_policy(env_evaluate, agent)
evaluate_rewards.append(evaluate_reward)
print("evaluate_num:{} \t evaluate_reward:{}".format(evaluate_num, evaluate_reward))
                # writer.add_scalar('step_rewards_{}'.format(env_name[env_index]), evaluate_reward, global_step=total_steps)
# Save the rewards
if evaluate_num % 10 == 0:
np.save('./data_train/TD3', np.array(evaluate_rewards))
total_steps += 1
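    # After training, the saved evaluation curve can be inspected offline, e.g.
    # (a minimal sketch, assuming the file written by the loop above):
    #     rewards = np.load('./data_train/TD3.npy')
    #     print(rewards)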