policy.py
import numpy as np


class Policy:
    """Simple two-layer policy network with policy-gradient training and RMSprop updates."""

    def __init__(self, H, D, gamma, batch_size, decay_rate, learning_rate):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.gamma = gamma
        self.model = {}
        self.model['W1'] = np.random.randn(H, D) / np.sqrt(D)  # "Xavier" initialization
        self.model['W2'] = np.random.randn(H) / np.sqrt(H)
        # buffers that accumulate gradients over a batch, and RMSprop memory
        self.grad_buffer = {k: np.zeros_like(v) for k, v in self.model.items()}
        self.rmsprop_cache = {k: np.zeros_like(v) for k, v in self.model.items()}

    def sigmoid(self, x):
        # squash to the interval (0, 1)
        return 1.0 / (1.0 + np.exp(-x))

    def discount_rewards(self, r):
        """Take a 1D float array of rewards and compute the discounted reward."""
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def forward(self, x):
        h = np.dot(self.model['W1'], x)
        h[h < 0] = 0  # ReLU nonlinearity
        logp = np.dot(self.model['W2'], h)
        p = self.sigmoid(logp)
        return p, h  # return probability of taking action 2, and hidden state

    def backward(self, eph, epdlogp, epx):
        """Backward pass. (eph is an array of intermediate hidden states.)"""
        dW2 = np.dot(eph.T, epdlogp).ravel()
        dh = np.outer(epdlogp, self.model['W2'])
        dh[eph <= 0] = 0  # backprop through the ReLU
        dW1 = np.dot(dh.T, epx)
        return {'W1': dW1, 'W2': dW2}

    def update(self, grad, episode_number):
        for k in self.model:
            self.grad_buffer[k] += grad[k]  # accumulate gradient over the batch
        # perform RMSprop parameter update every batch_size episodes
        if episode_number % self.batch_size == 0:
            for k, v in self.model.items():
                g = self.grad_buffer[k]
                self.rmsprop_cache[k] = self.decay_rate * self.rmsprop_cache[k] + (1 - self.decay_rate) * g**2
                self.model[k] += self.learning_rate * g / (np.sqrt(self.rmsprop_cache[k]) + 1e-5)
                self.grad_buffer[k] = np.zeros_like(v)  # reset batch gradient buffer
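

# -----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): a minimal
# policy-gradient loop driving Policy on synthetic data, showing how forward(),
# discount_rewards(), backward() and update() fit together. The sizes, the
# hyperparameters, the fake observations/rewards, and the 2/3 action encoding
# (taken from the comment in forward()) are assumptions for demonstration only.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    H, D = 200, 80 * 80  # hidden units, input dimensionality (assumed)
    policy = Policy(H, D, gamma=0.99, batch_size=10,
                    decay_rate=0.99, learning_rate=1e-4)

    for episode_number in range(1, 31):
        xs, hs, dlogps, drs = [], [], [], []
        for _ in range(100):                      # fake 100-step episode
            x = np.random.randn(D)                # stand-in for a real observation
            aprob, h = policy.forward(x)
            action = 2 if np.random.uniform() < aprob else 3
            y = 1 if action == 2 else 0           # "fake label" for the taken action
            xs.append(x)
            hs.append(h)
            dlogps.append(y - aprob)              # gradient of log prob w.r.t. the logit
            drs.append(np.random.choice([-1.0, 0.0, 1.0]))  # stand-in reward

        # stack the episode and weight log-prob gradients by normalized discounted return
        epx = np.vstack(xs)
        eph = np.vstack(hs)
        epdlogp = np.vstack(dlogps)
        epr = np.vstack(drs)
        discounted_epr = policy.discount_rewards(epr)
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr) + 1e-8
        epdlogp *= discounted_epr                 # modulate gradients by the advantage

        grad = policy.backward(eph, epdlogp, epx)
        policy.update(grad, episode_number)       # RMSprop step every batch_size episodes
        print('episode %d finished, mean reward %.3f' % (episode_number, float(np.mean(epr))))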