runner.py
import numpy as np

from alg_parameters import *
from util import swap_flat


class Runner(object):
    """Run multiple episodes in multiprocessing environments."""

    def __init__(self, env, model):
        """Initialize environments and recurrent hidden states."""
        self.env = env
        self.model = model
        self.episode_rewards = np.zeros((N_ENVS,))
        self.env.reset()
        self.obs = env.get_obs()
        self.state = env.get_state()
        # Repeat the global state for every agent and concatenate it with the
        # local observations to build the centralized critic input.
        self.state = np.repeat(self.state, N_AGENTS, axis=1)
        self.cent_state = np.concatenate((self.obs, self.state), axis=-1)
        self.dones = [False for _ in range(N_ENVS)]
        # One recurrent (c, h) state pair per agent in every environment.
        self.actor_hidden_state_c = np.zeros((N_ENVS * N_AGENTS, ACTOR_LAYER2))
        self.actor_hidden_state_h = np.zeros((N_ENVS * N_AGENTS, ACTOR_LAYER2))
        self.critic_hidden_state_c = np.zeros((N_ENVS * N_AGENTS, CRITIC_LAYER2))
        self.critic_hidden_state_h = np.zeros((N_ENVS * N_AGENTS, CRITIC_LAYER2))
    def run(self):
        # Buffers used to store one rollout of experience.
        mb_obs, mb_cent_state, mb_rewards, mb_values, mb_dones, \
            ep_infos, mb_actions, mb_ps = [], [], [], [], [], [], [], []
        mb_actor_hidden_state_c, mb_actor_hidden_state_h, \
            mb_critic_hidden_state_c, mb_critic_hidden_state_h = [], [], [], []
        episode_rewards_info = []
        for _ in range(N_STEPS):
            # Store the inputs fed to the networks at this step.
            mb_obs.append(self.obs)
            mb_cent_state.append(self.cent_state)
            mb_actor_hidden_state_c.append(self.actor_hidden_state_c)
            mb_critic_hidden_state_c.append(self.critic_hidden_state_c)
            mb_actor_hidden_state_h.append(self.actor_hidden_state_h)
            mb_critic_hidden_state_h.append(self.critic_hidden_state_h)
            valid_action = self.env.get_avail_actions()
            actions, values, ps, critic_hidden_state, actor_hidden_state = self.model.step(
                self.obs, self.cent_state, valid_action,
                self.critic_hidden_state_c, self.critic_hidden_state_h,
                self.actor_hidden_state_c, self.actor_hidden_state_h)
            self.critic_hidden_state_c, self.critic_hidden_state_h = critic_hidden_state
            self.actor_hidden_state_c, self.actor_hidden_state_h = actor_hidden_state
            mb_values.append(values)
            mb_ps.append(ps)
            mb_actions.append(actions)
            mb_dones.append(self.dones)
            rewards, self.dones, infos = self.env.step(actions)
            self.obs = self.env.get_obs()
            self.state = self.env.get_state()
            self.state = np.repeat(self.state, N_AGENTS, axis=1)
            self.cent_state = np.concatenate((self.obs, self.state), axis=-1)
            self.episode_rewards += rewards
            true_index = np.argwhere(self.dones)
            if len(true_index) != 0:
                # Reset the recurrent states of every agent in the environments
                # whose episode just finished.
                true_index = np.squeeze(true_index)
                self.actor_hidden_state_c = np.reshape(self.actor_hidden_state_c, (-1, N_AGENTS, ACTOR_LAYER2))
                self.actor_hidden_state_h = np.reshape(self.actor_hidden_state_h, (-1, N_AGENTS, ACTOR_LAYER2))
                self.critic_hidden_state_c = np.reshape(self.critic_hidden_state_c, (-1, N_AGENTS, CRITIC_LAYER2))
                self.critic_hidden_state_h = np.reshape(self.critic_hidden_state_h, (-1, N_AGENTS, CRITIC_LAYER2))
                self.actor_hidden_state_c[true_index] = np.zeros(self.actor_hidden_state_c[true_index].shape)
                self.actor_hidden_state_h[true_index] = np.zeros(self.actor_hidden_state_h[true_index].shape)
                self.critic_hidden_state_c[true_index] = np.zeros(self.critic_hidden_state_c[true_index].shape)
                self.critic_hidden_state_h[true_index] = np.zeros(self.critic_hidden_state_h[true_index].shape)
                self.actor_hidden_state_c = np.reshape(self.actor_hidden_state_c, (-1, ACTOR_LAYER2))
                self.actor_hidden_state_h = np.reshape(self.actor_hidden_state_h, (-1, ACTOR_LAYER2))
                self.critic_hidden_state_c = np.reshape(self.critic_hidden_state_c, (-1, CRITIC_LAYER2))
                self.critic_hidden_state_h = np.reshape(self.critic_hidden_state_h, (-1, CRITIC_LAYER2))
                # Record information of the episodes that just ended.
                episode_rewards_info.append(np.nanmean(self.episode_rewards[true_index]))
                self.episode_rewards[true_index] = np.zeros(self.episode_rewards[true_index].shape)
                if true_index.shape == ():  # only a single environment finished
                    ep_infos.append(infos[true_index])
                else:
                    for item in true_index:
                        ep_infos.append(infos[item])
            mb_rewards.append(rewards)
        # Convert the per-step lists into (N_STEPS, ...) batches.
        mb_obs = np.asarray(mb_obs, dtype=np.float32)
        mb_cent_state = np.asarray(mb_cent_state, dtype=np.float32)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        # Broadcast the shared per-environment reward to every agent.
        mb_rewards = np.expand_dims(mb_rewards, axis=-1)
        mb_rewards = np.repeat(mb_rewards, N_AGENTS, axis=-1)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is deprecated; use the builtin
        mb_actions = np.asarray(mb_actions, dtype=np.int32)
        mb_ps = np.asarray(mb_ps, dtype=np.float32)
        mb_actor_hidden_state_c = np.asarray(mb_actor_hidden_state_c, dtype=np.float32)
        mb_actor_hidden_state_h = np.asarray(mb_actor_hidden_state_h, dtype=np.float32)
        mb_critic_hidden_state_c = np.asarray(mb_critic_hidden_state_c, dtype=np.float32)
        mb_critic_hidden_state_h = np.asarray(mb_critic_hidden_state_h, dtype=np.float32)
        mb_actor_hidden_state_c = np.reshape(mb_actor_hidden_state_c, (N_STEPS, -1, N_AGENTS, ACTOR_LAYER2))
        mb_actor_hidden_state_h = np.reshape(mb_actor_hidden_state_h, (N_STEPS, -1, N_AGENTS, ACTOR_LAYER2))
        mb_critic_hidden_state_c = np.reshape(mb_critic_hidden_state_c, (N_STEPS, -1, N_AGENTS, CRITIC_LAYER2))
        mb_critic_hidden_state_h = np.reshape(mb_critic_hidden_state_h, (N_STEPS, -1, N_AGENTS, CRITIC_LAYER2))
        # Calculate GAE advantages:
        #   delta_t = r_t + GAMMA * V(s_{t+1}) * (1 - done) - V(s_t)
        #   A_t     = delta_t + GAMMA * LAM * (1 - done) * A_{t+1}
        last_values = self.model.value(self.cent_state, self.critic_hidden_state_c,
                                       self.critic_hidden_state_h)
        mb_advs = np.zeros_like(mb_rewards)
        last_gae_lam = 0
        for t in reversed(range(N_STEPS)):
            if t == N_STEPS - 1:
                # Bootstrap from the value of the state following the last step.
                next_nonterminal = 1.0 - self.dones
                next_nonterminal = np.expand_dims(next_nonterminal, axis=1)
                next_nonterminal = np.repeat(next_nonterminal, N_AGENTS, axis=1)
                next_values = last_values
            else:
                next_nonterminal = 1.0 - mb_dones[t + 1]
                next_nonterminal = np.expand_dims(next_nonterminal, axis=1)
                next_nonterminal = np.repeat(next_nonterminal, N_AGENTS, axis=1)
                next_values = mb_values[t + 1]
            delta = mb_rewards[t] + GAMMA * next_values * next_nonterminal - mb_values[t]
            mb_advs[t] = last_gae_lam = delta + GAMMA * LAM * next_nonterminal * last_gae_lam
        mb_returns = mb_advs + mb_values
        # Flatten the (step, env) axes of every batch and return the rollout.
        return (*map(swap_flat, (mb_obs, mb_cent_state, mb_returns, mb_values, mb_actions, mb_ps,
                                 mb_critic_hidden_state_c, mb_critic_hidden_state_h, mb_actor_hidden_state_c,
                                 mb_actor_hidden_state_h)), ep_infos, np.nanmean(episode_rewards_info))
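

# ---------------------------------------------------------------------------
# The block below is not part of the original file. It is a minimal,
# self-contained sketch illustrating the GAE recursion used in Runner.run()
# on a tiny hand-made rollout for a single environment and agent. The gamma
# and lam values here are illustrative, not the ones from alg_parameters;
# only numpy is required.
if __name__ == "__main__":
    gamma, lam = 0.99, 0.95
    # A 4-step rollout: rewards, value estimates, and done flags returned
    # after each step (the episode ends at the final step).
    rewards = np.array([1.0, 0.0, 0.0, 1.0], dtype=np.float32)
    values = np.array([0.5, 0.4, 0.3, 0.2], dtype=np.float32)
    dones = np.array([False, False, False, True])
    last_value = 0.0  # bootstrap value of the state after the rollout

    advs = np.zeros_like(rewards)
    last_gae_lam = 0.0
    for t in reversed(range(len(rewards))):
        next_nonterminal = 1.0 - float(dones[t])
        next_values = last_value if t == len(rewards) - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_values * next_nonterminal - values[t]
        advs[t] = last_gae_lam = delta + gamma * lam * next_nonterminal * last_gae_lam
    print("advantages:", advs)
    print("returns:   ", advs + values)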