ppo.py
import numpy as np
import torch as T
import my_utils
import airsim
import os
import NNQvalues
import tempfile
goal = [20, 20, 20]
startdistance_goal = np.sqrt(goal[0]**2 + goal[1]**2 + goal[2]**2)
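# The goal position is [x, y, z]; compute_reward below only uses its x and y components.
# startdistance_goal is the straight-line distance from the origin to the goal and is not
# referenced elsewhere in this file.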
def train(client, policy, params):
    initX = 0
    initY = 0
    initZ = -10

    policy_optim = T.optim.Adam(policy.parameters(), lr=params["policy_lr"], weight_decay=params["weight_decay"],
                                eps=1e-4)

    batch_states = []
    batch_actions = []
    batch_rewards = []
    batch_new_states = []
    batch_terminals = []

    batch_ctr = 0
    batch_rew = 0
    step_ctr = 0
    for i in range(params["iters"]):
        # Start flying
        client.takeoffAsync().join()
        client.moveToPositionAsync(initX, initY, initZ, 5).join()

        # # Debug get images
        # for i in range(2):
        #     responses = client.simGetImages([
        #         airsim.ImageRequest("0", airsim.ImageType.DepthVis)])  # depth visualization image
        #         # airsim.ImageRequest("1", airsim.ImageType.DepthPerspective, True),  # depth in perspective projection
        #         # airsim.ImageRequest("1", airsim.ImageType.Scene),  # scene vision image in png format
        #         # airsim.ImageRequest("1", airsim.ImageType.Scene, False, False)])  # scene vision image in uncompressed RGBA array
        #     tmp_dir = "C:\\Users\\Filip\\PycharmProjects\\LearningDrone\\airsim_drone"
        #     for idx, response in enumerate(responses):
        #         filename = os.path.join(tmp_dir, str(idx))
        #         if response.pixels_as_float:
        #             print("Type %d, size %d" % (response.image_type, len(response.image_data_float)))
        #             airsim.write_pfm(os.path.normpath(filename + str(i) + '.pfm'), airsim.get_pfm_array(response))
        #         elif response.compress:  # png format
        #             print("Type %d, size %d" % (response.image_type, len(response.image_data_uint8)))
        #             airsim.write_file(os.path.normpath(filename + str(i) + '.png'), response.image_data_uint8)
        #     client.moveByVelocityAsync(0, 5, 0, 2).join()
        done = False
        while not done:
            # Get image for sample_action
            responses = client.simGetImages([airsim.ImageRequest("1", airsim.ImageType.Scene, False, False)])
            for idx, response in enumerate(responses):
                # img1d = np.array(responses[0].image_data_float, dtype=np.float)
                # img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
                img1d = np.frombuffer(response.image_data_uint8, dtype=np.uint8)
                img_rgb = img1d.reshape(response.height, response.width, 3)  # 144x256x3
                img_rgb = img_rgb.transpose(2, 0, 1)  # HWC -> CHW
                img_rgb = T.from_numpy(img_rgb.copy()).float()

                # Sample action from policy
                action = policy.sample_action(img_rgb.flatten()).detach()

            # Record transition (the state is stored as the estimated position, as a tensor so it can be batched)
            pos = client.getMultirotorState().kinematics_estimated.position
            batch_states.append(T.tensor([pos.x_val, pos.y_val, pos.z_val]).unsqueeze(0))
            batch_actions.append(action.unsqueeze(0))
            action = action.numpy()

            # Step action, rotate and move
            if action[0] < 0:
                action[0] = 0
            t = action[0].item()
            client.rotateByYawRateAsync(40, t)  # Fixed rotation speed, action[0] = rotation span
            velx = action[1].item()
            vely = action[2].item()
            client.moveByVelocityAsync(velx, vely, 0, 0.5).join()  # Moving in the xy plane, fixed z

            reward, done = compute_reward(client)
            batch_rew += reward
            step_ctr += 1

            # Record transition
            batch_rewards.append(my_utils.to_tensor(np.asarray(reward, dtype=np.float32), True))
            new_pos = client.getMultirotorState().kinematics_estimated.position
            batch_new_states.append(T.tensor([new_pos.x_val, new_pos.y_val, new_pos.z_val]).unsqueeze(0))
            batch_terminals.append(done)
        # Completed episode
        batch_ctr += 1

        if batch_ctr == params["batchsize"]:
            batch_states = T.cat(batch_states)
            batch_actions = T.cat(batch_actions)
            batch_rewards = T.cat(batch_rewards)

            # Scale rewards
            batch_rewards = (batch_rewards - batch_rewards.mean()) / (batch_rewards.std() + 1e-7)

            # Calculate episode advantages
            batch_advantages = calc_advantages_MC(params["gamma"], batch_rewards, batch_terminals)
            update_ppo(policy, policy_optim, batch_states, batch_actions, batch_advantages, params["ppo_update_iters"])

            print("Episode {}/{}, loss_V: {}, loss_policy: {}, mean ep_rew: {}".
                  format(i, params["iters"], None, None, batch_rew / params["batchsize"]))

            # Finally reset all batch lists
            batch_ctr = 0
            batch_rew = 0

            batch_states = []
            batch_actions = []
            batch_rewards = []
            batch_new_states = []
            batch_terminals = []

        # Save learned model
        # if i % 100 == 0 and i > 0:
        #     sdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
        #                         "agents/{}_{}_{}_pg.p".format(env.__class__.__name__, policy.__class__.__name__, params["ID"]))
        #     T.save(policy, sdir)
        #     print("Saved checkpoint at {} with params {}".format(sdir, params))
        # Reset
        client.reset()

        # If computer vision mode enabled
        # # Reset position
        # # currently reset() doesn't work in CV mode. Below is the workaround
        # client.simSetVehiclePose(airsim.Pose(airsim.Vector3r(0, 0, 0), airsim.to_quaternion(0, 0, 0)), True)
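
# Action convention used in train() above (the policy in NNQvalues is assumed to output
# at least a 3-element action vector):
#   action[0] -> yaw rotation span in seconds (clamped to >= 0), applied at a fixed 40 deg/s rate
#   action[1] -> x velocity, action[2] -> y velocity, applied for 0.5 s at fixed altitude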
def update_ppo(policy, policy_optim, batch_states, batch_actions, batch_advantages, update_iters):
    log_probs_old = policy.log_probs(batch_states, batch_actions).detach()
    c_eps = 0.2

    # Do ppo_update
    for k in range(update_iters):
        log_probs_new = policy.log_probs(batch_states, batch_actions)
        r = T.exp(log_probs_new - log_probs_old)
        loss = -T.mean(T.min(r * batch_advantages, r.clamp(1 - c_eps, 1 + c_eps) * batch_advantages))
        policy_optim.zero_grad()
        loss.backward()
        policy.soft_clip_grads(3.)
        policy_optim.step()
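
# Illustration only (not called anywhere): the clipped surrogate above keeps the probability
# ratio r = exp(log_probs_new - log_probs_old) inside [1 - c_eps, 1 + c_eps], so a single
# update cannot push the policy far from the one that collected the batch. A minimal check
# with dummy tensors (values here are made up):
#   r = T.tensor([0.5, 1.0, 1.5])
#   adv = T.tensor([1.0, 1.0, 1.0])
#   surrogate = T.min(r * adv, r.clamp(0.8, 1.2) * adv)   # -> [0.5, 1.0, 1.2]
#   loss = -T.mean(surrogate)                             # -> -0.9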
def calc_advantages_MC(gamma, batch_rewards, batch_terminals):
    N = len(batch_rewards)

    # Monte carlo estimate of targets
    targets = []
    for i in range(N):
        cumrew = T.tensor(0.)
        for j in range(i, N):
            cumrew += (gamma ** (j - i)) * batch_rewards[j]
            if batch_terminals[j]:
                break
        targets.append(cumrew.view(1, 1))
    targets = T.cat(targets)

    return targets
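
# Example (illustration only): with gamma = 0.9, rewards [1, 0, 2] and no terminal flags,
# the targets computed above are the discounted returns
#   t0 = 1 + 0.9 * 0 + 0.81 * 2 = 2.62
#   t1 = 0 + 0.9 * 2            = 1.8
#   t2 = 2
# No learned baseline is subtracted, so these returns are used directly as the
# "advantages" passed to update_ppo.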
def transform_input(responses):
    """Transforms output of simGetImages to one 84x84 image"""
    img1d = np.array(responses[0].image_data_float, dtype=np.float32)
    img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
    img2d = np.reshape(img1d, (responses[0].height, responses[0].width))

    from PIL import Image
    image = Image.fromarray(img2d)
    im_final = np.array(image.resize((84, 84)).convert('L'))

    return im_final
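
# Note: transform_input is not called in this script. It expects a float image response,
# e.g. airsim.ImageRequest("1", airsim.ImageType.DepthPerspective, True) with
# pixels_as_float=True, whereas the training loop above requests an uncompressed RGB
# Scene image.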
def compute_reward(client):
    # Distance from goal, measured in the xy plane (altitude is held fixed while moving)
    quad_state = client.getMultirotorState().kinematics_estimated.position
    collision_info = client.simGetCollisionInfo()

    reward = 0
    done = False

    distance_goal = np.sqrt((goal[0] - quad_state.x_val) ** 2 + (goal[1] - quad_state.y_val) ** 2)
    if distance_goal < 1:
        reward += 1000
        done = True

    # Small per-step penalty to encourage reaching the goal quickly
    reward -= 1

    if collision_info.has_collided:
        reward -= 1000
        done = True

    return reward, done
if __name__ == "__main__":
    # params = {"iters": 500000, "batchsize": 1, "gamma": 0.995, "policy_lr": 0.0007, "weight_decay": 0.0001, "ppo": True,
    #           "ppo_update_iters": 6, "animate": False, "train": True}
    params = {"iters": 100, "batchsize": 1, "gamma": 0.995, "policy_lr": 0.0007, "weight_decay": 0.0001, "ppo_update_iters": 6, "train": True}

    # Airsim environment
    client = airsim.MultirotorClient()
    client.confirmConnection()
    client.enableApiControl(True)
    client.armDisarm(True)

    policy = NNQvalues.Policy()
    train(client, policy, params)

    # Quit
    # that's enough fun for now. let's quit cleanly
    client.enableApiControl(False)
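
# Usage note (assumption): running this script requires an AirSim multirotor simulation
# already running and listening on the default RPC port, with the local my_utils and
# NNQvalues modules importable from the working directory.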