# DQNAgents.py
# Forked from ACabrejas/Timescales_and_Discount_Factor_in_UTC
from collections import deque
import numpy as np
import random
import PER
from General_agent import RLAgent
import tensorflow as tf
import tensorflow.keras.layers as kl
import tensorflow.keras.losses as kls
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model, Sequential, Model
from tensorflow.keras.layers import Dense, Input, Lambda, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.activations import relu
###
# leaky relu
lrelu = lambda x: relu(x, alpha=0.01)
###
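# Note: lrelu is available as an alternative activation, but the Dense layers below use the standard 'relu'.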
######################################################################################
## Deep Q Learning Agent (Use DoubleDQN flag to swap to DDQN)
######################################################################################
class DQNAgent(RLAgent):
def __init__(self, state_size, action_size, ID, memory_size, gamma, epsilon, alpha, copy_weights_frequency, PER_activated, DoubleDQN, Dueling):
super().__init__(ID)
# Agent Junction ID and Controller ID
self.signal_id = ID
# Number of states, action space and memory
self.state_size = state_size
self.action_size = action_size
# Agent Hyperparameters
self.gamma = gamma # discount rate
self.epsilon = epsilon # exploration rate
self.learning_rate = alpha # learning rate
# Agent Architecture
self.DoubleDQN = DoubleDQN # Double Deep Q Network Flag
self.Dueling = Dueling # Dueling Q Networks Flag
self.PER_activated = PER_activated # Prioritized Experience Replay Flag
self.type = 'DQN' # Type of the agent
# Model and target networks
self.copy_weights_frequency = copy_weights_frequency # Frequency to copy weights to target network
self.model = self._build_model()
self.target_model = self._build_model()
self.target_model.set_weights(self.model.get_weights())
self.model.summary()
# Architecture Debug Messages
if self.DoubleDQN:
if self.Dueling:
print("Deployed instance of Dueling Double Deep Q Learning Agent(s) at Intersection " + str(ID) + "\n")
else:
print("Deployed instance of Double Deep Q Learning Agent(s) at Intersection " + str(ID) + "\n")
else:
if self.Dueling:
print("Deployed instance of Dueling Deep Q Learning Agent(s) at Intersection " + str(ID) + "\n")
else:
print("Deployed instance of Standard Deep Q Learning Agent(s) at Intersection " + str(ID) + "\n")
# Initial Setup of S, A, R, S_
self.state = np.zeros(state_size)[np.newaxis,:]
self.newstate = np.zeros(state_size)[np.newaxis,:]
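        # (each state keeps a leading batch dimension of 1 so it can be fed to model.predict directly)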
self.action = 0
self.newaction = 0
self.reward = 0
# Metrics Storage Initialization
self.episode_reward = []
self.episode_memory = []
self.reward_storage = []
self.loss = []
# Metrics for the testing
self.queues_over_time = [[0,0,0,0]]
self.accumulated_delay= [0]
self.flow_in_intersection = []
if self.PER_activated:
            # If PER is activated, spawn a binary (sum) tree and Memory object to store priorities and experiences
self.memory = PER.Memory(memory_size)
else:
# Else use the deque structure to only store experiences which will be sampled uniformly
self.memory = deque(maxlen=memory_size)
self.memory2 = deque(maxlen=1000000)
def _build_model(self):
'''
This method builds the neural network at the core of the agent
'''
if self.Dueling:
# Architecture for the Neural Net in the Dueling Deep Q-Learning Model
#model = Sequential()
            input_layer = Input(shape=self.state_size)
# conv1 = kl.Conv2D(32, (3, 3), activation= 'relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name = 'value_conv1')(input_layer)
# conv2 = kl.Conv2D(64, (3, 3), activation= 'relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name = 'value_conv2')(conv1)
# conv3 = kl.Conv2D(64, (3, 3), activation= 'relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name = 'value_conv3')(conv2)
# flatten = Flatten()(conv3)
dense1 = Dense(24, activation= 'relu', kernel_regularizer=regularizers.l2(0.001))(input_layer)
dense2 = Dense(24, activation= 'relu', kernel_regularizer=regularizers.l2(0.001))(dense1)
fc1 = Dense(24)(dense2)
dueling_actions = Dense(self.action_size,kernel_regularizer=regularizers.l2(0.001))(fc1)
fc2 = Dense(24)(dense2)
dueling_values = Dense(1,kernel_regularizer=regularizers.l2(0.001))(fc2)
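            # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
            # Subtracting the mean advantage keeps the value and advantage streams identifiable.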
def dueling_operator(duel_input):
duel_v = duel_input[0]
duel_a = duel_input[1]
return (duel_v + (duel_a - K.mean(duel_a, axis = 1, keepdims = True)))
policy = Lambda(dueling_operator, name = 'policy')([dueling_values, dueling_actions])
model = Model(inputs=[input_layer], outputs=[policy])
            model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
            return model
else:
# Architecture for the Neural Net in Deep-Q learning Model (also Double version)
            input_layer = Input(shape=self.state_size)
dense1 = Dense(48, activation= 'relu', kernel_regularizer=regularizers.l2(0.001))(input_layer)
dense2 = Dense(48, activation= 'relu', kernel_regularizer=regularizers.l2(0.001))(dense1)
dense3 = Dense(48, activation= 'relu', kernel_regularizer=regularizers.l2(0.001))(dense2)
dense4 = Dense(self.action_size, activation='linear', kernel_regularizer=regularizers.l2(0.01))(dense3)
model = Model(inputs=[input_layer], outputs=[dense4])
            model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
return model
    # Append new experiences on the right; if the memory limit is exceeded, the leftmost item is dropped
def remember(self, state, action, reward, next_state, done):
'''
        This method operates differently depending on whether PER memory is active.
        PER active  : pack the (s, a, r, s', done) tuple into an array and store it in the priority tree.
        PER inactive: append the tuple on the right of the deque; when the deque is full, the leftmost item is dropped.
'''
if self.PER_activated:
            experience = np.array([state, action, reward, next_state, done], dtype=object)
self.memory.store(experience)
self.episode_memory.append([state, action, reward, next_state, done])
self.episode_reward.append(reward)
else:
self.memory.append([state, action, reward, next_state, done])
self.episode_memory.append([state, action, reward, next_state, done])
self.episode_reward.append(reward)
def remember2(self, state, action, reward, next_state, done):
'''
        Unconditionally append the (s, a, r, s', done) tuple to the secondary memory (memory2),
        regardless of whether PER is active.
'''
self.memory2.append([state, action, reward, next_state, done])
def reset(self):
self.episode_memory = []
self.episode_reward = []
def choose_action(self, state):
'''
This method chooses an action using an epsilon-greedy method.
Input : State as an array.
Output: Action as an integer.
'''
if np.random.rand() <= self.epsilon:
action = random.randrange(self.action_size)
#print('Chosen Random Action {}'.format(action+1))
else:
act_values = self.model.predict(state)
action = np.argmax(act_values[0])
#print('Chosen Not-Random Action {}'.format(action+1))
return action
def learn_batch(self, batch_size, episode):
'''
Sample a batch of "batch_size" experiences.
Perform 1 step of gradient descent on all of them simultaneously.
        Update the priority weights in the memory tree (when PER is active).
'''
state_vector = []
target_f_vector = []
absolute_errors = []
if self.PER_activated:
tree_idx, minibatch, ISWeights_mb = self.memory.sample(batch_size)
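            # Note: ISWeights_mb (importance-sampling weights) are retrieved here but not applied to the loss below.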
else:
idx = np.random.randint(len(self.memory), size=batch_size, dtype=int)
            minibatch = np.array(self.memory, dtype=object)[idx]  # object dtype: rows mix arrays and scalars
        state = np.concatenate(minibatch[:, 0], axis=0)
        action = minibatch[:, 1].astype('int32')
        reward = minibatch[:, 2].reshape(batch_size, 1)
        next_state = np.concatenate(minibatch[:, 3], axis=0)
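        # Bellman targets for the sampled batch:
        #   Double DQN : the online network selects argmax_a Q(s', a) and the target network
        #                evaluates that action, reducing the overestimation bias of vanilla DQN.
        #   Standard   : take the max over the target network's own estimates (fixed Q-target).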
if self.DoubleDQN:
next_action = np.argmax(self.model.predict(next_state), axis=1)
target = reward + self.gamma * self.target_model.predict(next_state)[np.arange(batch_size),next_action].reshape(batch_size,1)
else:
# Fixed Q-Target
target = reward + self.gamma * np.max(self.target_model.predict(next_state),axis=1).reshape(batch_size,1)
# This section incorporates the reward into the prediction and calculates the absolute error between old and new
target_f = self.model.predict(state)
absolute_errors = np.abs(target_f[np.arange(batch_size),action].reshape(batch_size,1)-target)
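        # absolute_errors holds |TD error| per sampled transition; when PER is active these
        # become the updated priorities passed to memory.batch_update below.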
target_f[np.arange(batch_size),action] = target.reshape(batch_size)
        history = self.model.fit(state, target_f, epochs=1, verbose=2, batch_size=batch_size)
        self.loss.append(history.history['loss'][0])
if self.PER_activated:
#Update priority
self.memory.batch_update(tree_idx, absolute_errors)
def copy_weights(self):
'''
This method copies the weights from the model to the target model.
'''
self.target_model.set_weights(self.model.get_weights())
print("Weights succesfully copied to Target model for Agent {}.".format(self.ID))