policyInitializer.py
import json
import gzip
import sys
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
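# Editorial summary (inferred from the code below): this script appears to initialize
# a policy network by supervised, behavioral-cloning-style training on logged
# environment transitions. It loads the gzip-compressed JSON files env-train.json and
# env-train-2.json ... env-train-5.json from the working directory, keeps only
# transitions whose reward is 0.5, fits a small softmax network to predict the taken
# action from the observation, and saves the weights to policy_model/policyinitial.ckpt.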
'''Load Data'''
def load_data(t):
    # Read the first gzip-compressed JSON file of transitions
    with gzip.GzipFile('env-{}.json'.format(t), 'r') as fin:
        json_bytes = fin.read()
        print('file read')
    json_str = json_bytes.decode('utf-8')
    print('string decoded')
    data = json.loads(json_str)
    print('json parsed')
    # Append the remaining training files
    data.extend(json.loads(gzip.GzipFile('env-train-2.json').read().decode('utf-8')))
    print('second file')
    data.extend(json.loads(gzip.GzipFile('env-train-3.json').read().decode('utf-8')))
    print('third file')
    data.extend(json.loads(gzip.GzipFile('env-train-4.json').read().decode('utf-8')))
    print('fourth file')
    data.extend(json.loads(gzip.GzipFile('env-train-5.json').read().decode('utf-8')))
    print('all files loaded', len(data))
    # data = data[:3000000]
    np.random.shuffle(data)
    print('shuffled')

    # Prepare one-hot vectors for the 88 discrete actions
    act_vectors = np.eye(88)
    j = 0
    _x = []
    _y = []
    # Populate data: keep only transitions with reward 0.5,
    # consuming the list from the back to free memory as we go
    while data:
        d = data[-1]
        if d['r'] == 0.5:
            _x.append(d['obs'])
            _y.append(act_vectors[d['act']])
            j += 1
            if j % 1000 == 0:
                sys.stdout.write(str(j // 1000) + ' ')
                sys.stdout.flush()
        del data[-1]

    # Transform to numpy arrays
    _x = np.asarray(_x)
    _y = np.asarray(_y)

    # Calculate per-feature mean and std for input normalization
    if t == 'train':
        _mean = _x.mean(axis=0)
        _std = _x.std(axis=0)
        return _x, _y, j, _mean, _std
    return _x, _y, j
x, y, total, mean, std = load_data('train')
print(total)
'''Load Data Finished'''
sess = tf.Session()
n_state = 25
n_action = 88
# Policy network start
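# Editorial note: the policy network is a small fully connected net (three weight
# matrices with softmax nonlinearities) mapping the 25-dimensional state to a
# probability distribution over 88 actions. The training-set mean and std computed
# above are embedded in the graph as constants, so inputs are normalized the same
# way at inference time.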
state = tf.placeholder(shape=[None, n_state], dtype=tf.float32)
_state = tf.divide(tf.subtract(state, mean), std)
w1 = tf.get_variable("w1", shape=[n_state, 96])
w2 = tf.get_variable("w2", shape=[96, 96])
w3 = tf.get_variable("w3", shape=[96, n_action])
saver = tf.train.Saver({"w1": w1, "w2": w2, "w3": w3})
o1 = tf.matmul(_state, w1)
h1 = tf.math.softmax(o1)
o2 = tf.matmul(h1, w2)
h2 = tf.math.softmax(o2)
action_logits = tf.matmul(h2, w3)
action_prob = tf.math.softmax(action_logits)
# action_dist = tfp.distributions.Categorical(probs=action_prob[0])
action_one_hot = tf.placeholder(shape=[None, n_action], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(tf.subtract(action_one_hot, action_prob)), 1)
# tf.nn.moments returns (mean, variance); take the square root to get the std
mean_loss, var_loss = tf.nn.moments(loss, axes=[0])
std_loss = tf.sqrt(var_loss)
rms = tf.train.RMSPropOptimizer(0.001)
opt = rms.minimize(mean_loss)
# Variable initialization & training loop
sess.run(tf.global_variables_initializer())
to_plot = []
to_plot_std = []
i = 0
while i < total:
    # Train on mini-batches of 128 examples in a single pass over the data
    end = min(i + 128, total)
    feed_dict = {state: x[i:end], action_one_hot: y[i:end]}
    loss_m_v, loss_std_v, _ = sess.run([mean_loss, std_loss, opt], feed_dict=feed_dict)
    if i % 100 == 0:
        print(loss_m_v, loss_std_v)
    to_plot.append(loss_m_v)
    to_plot_std.append(loss_std_v)
    i += 128
plt.plot(to_plot)
# Shade a band of +/- one std around the per-batch mean loss
plt.fill_between(range(len(to_plot)),
                 [m - s for m, s in zip(to_plot, to_plot_std)],
                 [m + s for m, s in zip(to_plot, to_plot_std)], alpha=0.3)
plt.show()
saver.save(sess, "policy_model/policyinitial.ckpt")