# qlearning_skeleton.py
import numpy as np
from grid_world import *
import matplotlib.pyplot as plt
import math
import gym


# TODO: Fill this function in
# Function that takes a 2d numpy array Q (num_states by num_actions),
# an epsilon in the range [0, 1], and the state,
# and outputs an action according to an epsilon-greedy policy
# (random actions are chosen with probability epsilon)
def tabular_epsilon_greedy_policy(Q, eps, state, rand_flag):
    # if rand_flag=True then no random actions are selected (truly greedy policy)
    n_actions = Q.shape[1]
    if rand_flag or np.random.rand() > eps:  # greedy with probability 1 - eps
        action = np.argmax(Q[state, :])
    else:
        action = np.random.randint(0, n_actions)
    return action
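

# A minimal sanity check of the policy above (a hypothetical helper, not part of the
# original skeleton; call it manually if you want to verify the behaviour): with
# rand_flag=True the greedy action is always returned, and with eps=1 the sampled
# actions should cover every column of Q (the random case is probabilistic, but a
# miss over 1000 draws is vanishingly unlikely).
def _demo_epsilon_greedy():
    Q_demo = np.array([[1.0, 5.0, 2.0],
                       [0.0, 0.1, 0.2]])
    assert tabular_epsilon_greedy_policy(Q_demo, 0.0, 0, True) == 1  # greedy action in state 0
    sampled = {tabular_epsilon_greedy_policy(Q_demo, 1.0, 1, False) for _ in range(1000)}
    assert sampled == {0, 1, 2}  # fully random when eps=1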


class QLearning(object):
    # Initialize a QLearning object
    # alpha is the "learning_rate"
    def __init__(self, num_states, num_actions, alpha=0.5, gamma=0.9):
        # initialize Q values to something
        # self.Q = np.zeros((num_states, num_actions))
        self.Q = np.random.random((num_states, num_actions))
        self.alpha = alpha
        self.gamma = gamma

    # TODO: fill in this function
    # updates the Q value table
    # with a (state, action, reward, next_state) from one step in the environment
    # done is a bool indicating if the episode terminated at this step
    # you can return anything from this function to help with plotting/debugging
    def update(self, state, action, reward, next_state, done):
        a = np.argmax(self.Q[next_state, :])
        if done:  # at the terminal state Q[next_state, a] = 0
            self.Q[state, action] += self.alpha * (reward - self.Q[state, action])
        else:
            self.Q[state, action] += self.alpha * (reward + self.gamma * self.Q[next_state, a] - self.Q[state, action])
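

# A quick numerical illustration of the update rule above (a hypothetical check, not part
# of the original skeleton): with alpha=0.5, gamma=0.9, Q[s, a] = 0, reward 1, and a best
# next-state value of 2, a single update should give 0 + 0.5 * (1 + 0.9 * 2 - 0) = 1.4.
def _demo_q_update():
    agent = QLearning(num_states=2, num_actions=2, alpha=0.5, gamma=0.9)
    agent.Q = np.zeros((2, 2))
    agent.Q[1, :] = [2.0, 0.0]  # best action value in the next state is 2.0
    agent.update(0, 0, 1.0, 1, False)
    assert np.isclose(agent.Q[0, 0], 1.4)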


# TODO: fill this in
# run the greedy policy (no randomness) on the environment for niter episodes
# and return the fraction of episodes in which it reached the goal
def evaluate_greedy_policy(qlearning, env, niter=100):
    tstep_rewards = []
    num_goals = 0  # stores the number of times the goal is reached over the niter episodes
    for e in range(niter):
        state = env.reset()
        total = 0  # stores the sum of rewards per episode
        done = False
        while True:
            # eps=0.0 because rand_flag=True already means NO randomness (truly greedy)
            action = tabular_epsilon_greedy_policy(qlearning.Q, 0.0, state, True)
            next_state, reward, done = env.step(action)
            total += reward
            qlearning.update(state, action, reward, next_state, done)
            state = next_state
            if done:
                tstep_rewards.append(total)
                break
    for i in range(len(tstep_rewards)):
        if tstep_rewards[i] > 0:
            num_goals += 1
    # returns the fraction of episodes in which the policy reached the goal
    frac = num_goals / len(tstep_rewards)
    return tstep_rewards, frac
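

# Rough usage sketch for the evaluation above (hedged: it assumes, as the counting loop
# does, the GridWorld convention that an episode's total reward is positive only when
# the goal is reached):
#   rewards, frac = evaluate_greedy_policy(qlearning, env, niter=100)
#   print(frac)  # fraction of evaluation episodes that reached the goal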


# this function takes in an environment (GridWorld),
# a QLearning object,
# an epsilon in the range [0, 1],
# and the number of episodes you want to run the algorithm for
# Returns the sum of the rewards at each timestep for each episode
def offpolicyTD(env, qlearning, num_episodes, eps):
    tstep_rewards = []
    qvalues = []
    saveQ = np.zeros((env.get_num_states(), env.get_num_actions()))
    randtime = 0
    q_optimal = 0
    for e in range(num_episodes):
        state = env.reset()
        episode_log = []
        total = 0  # stores the sum of rewards at each timestep per episode
        done = False
        while True:
            action = tabular_epsilon_greedy_policy(qlearning.Q, eps, state, False)  # we want randomness
            next_state, reward, done = env.step(action)
            # append results to the episode log
            episode_log.append([state, action])
            # for part b of q2 - save the Q-value table at some earlier time, before the Q-values converge
            randtime += 1
            if randtime == 5:
                saveQ = qlearning.Q.copy()  # copy the table so later updates do not overwrite the snapshot
            total += reward
            qlearning.update(state, action, reward, next_state, done)
            state = next_state
            # if done, an episode has been completed; store the results for later
            if done:
                # collects the Q-value of the starting state for this episode
                episode_log = np.array(episode_log)
                for i in range(len(episode_log)):
                    if episode_log[i, 0] == 0:  # if at the starting state
                        qvalues.append(qlearning.Q[episode_log[i, 0], episode_log[i, 1]])
                        break
                # qvalues.append(qlearning.Q[episode_log[-1, 0], episode_log[-1, 1]])  # for all states
                # collects the sum of the rewards at each timestep per episode
                tstep_rewards.append(total)
                # finds the optimal Q-value (the max over the collected Q(state, action) values)
                q_optimal = np.max(np.asarray(qvalues))
                # iteration number when the optimal value is first reached
                # print(np.argmax(np.asarray(qvalues)))
                break
    return tstep_rewards, np.asarray(qvalues), q_optimal, saveQ, qlearning.Q


if __name__ == "__main__":
    num_episodes = 1000
    eps = 0.1
    env = GridWorld(MAP2)

    # for question 1 -- MAP3
    qlearning = QLearning(env.get_num_states(), env.get_num_actions())
    [tstep_rewards, _, _, _, _] = offpolicyTD(env, qlearning, num_episodes, eps)
    plt.plot(tstep_rewards)
    plt.xlabel("Number of Episodes")
    plt.ylabel("Total Rewards")
    plt.title("eps-Greedy policy")
    plt.show()

    # evaluate the greedy policy to see how well it performs
    qlearning = QLearning(env.get_num_states(), env.get_num_actions())
    [tstep_rewards2, frac] = evaluate_greedy_policy(qlearning, env)
    print("Finding goal " + str(frac * 100) + "% of the time.")
    plt.plot(tstep_rewards[0:100], label='eps-Greedy policy')
    plt.plot(tstep_rewards2, label='Greedy policy')
    plt.xlabel("Number of Episodes")
    plt.ylabel("Total Rewards")
    plt.title("Comparison of two policies")
    plt.legend()
    plt.show()

    # for question 2 part a -- MAP2
    qlearning = QLearning(env.get_num_states(), env.get_num_actions())
    [_, qvalue, q_optimal, saveQ, Q] = offpolicyTD(env, qlearning, num_episodes, eps)
    plt.plot(qvalue, color='c')
    plt.axhline(y=q_optimal, color='r', linestyle='--', label='optimal Q value')
    plt.xlabel("Number of Episodes")
    plt.ylabel("Q Values")
    plt.title("eps-Greedy policy")
    plt.legend()
    plt.show()

    # for question 2 part b -- MAP2
    # print(np.matrix(saveQ))  ### is this all we need to do?

    # for question 3 -- MAP4
    env = GridWorld(MAP4)  # switch to MAP4 so the reshape below matches the grid dimensions
    n_episode = 1000
    n_rows = len(MAP4)
    n_cols = len(MAP4[0])
    qlearning = QLearning(env.get_num_states(), env.get_num_actions())
    [_, _, _, _, Q] = offpolicyTD(env, qlearning, n_episode, eps)
    temp = np.zeros(env.get_num_states())
    for s in range(env.get_num_states()):
        temp[s] = np.amax(Q[s, :])
    newQ = np.reshape(temp, (n_rows, n_cols))
    plt.imshow(newQ)
    plt.colorbar()
    plt.title("maximum Q-value matrix for " + str(n_episode) + " episodes")
    plt.show()